1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
18#include "AMDGPUSelectionDAGInfo.h"
19#include "AMDGPUTargetMachine.h"
20#include "GCNSubtarget.h"
21#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
22#include "SIMachineFunctionInfo.h"
23#include "SIRegisterInfo.h"
24#include "llvm/ADT/APInt.h"
25#include "llvm/ADT/FloatingPointMode.h"
26#include "llvm/ADT/Statistic.h"
27#include "llvm/Analysis/OptimizationRemarkEmitter.h"
28#include "llvm/Analysis/UniformityAnalysis.h"
29#include "llvm/CodeGen/Analysis.h"
30#include "llvm/CodeGen/ByteProvider.h"
31#include "llvm/CodeGen/FunctionLoweringInfo.h"
32#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
33#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
34#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
35#include "llvm/CodeGen/MachineFrameInfo.h"
36#include "llvm/CodeGen/MachineFunction.h"
37#include "llvm/CodeGen/MachineLoopInfo.h"
38#include "llvm/CodeGen/PseudoSourceValueManager.h"
39#include "llvm/CodeGen/SDPatternMatch.h"
40#include "llvm/IR/DiagnosticInfo.h"
41#include "llvm/IR/IRBuilder.h"
42#include "llvm/IR/IntrinsicInst.h"
43#include "llvm/IR/IntrinsicsAMDGPU.h"
44#include "llvm/IR/IntrinsicsR600.h"
45#include "llvm/IR/MDBuilder.h"
46#include "llvm/Support/CommandLine.h"
47#include "llvm/Support/KnownBits.h"
48#include "llvm/Support/ModRef.h"
49#include "llvm/Transforms/Utils/LowerAtomic.h"
50#include <optional>
51
52using namespace llvm;
53using namespace llvm::SDPatternMatch;
54
55#define DEBUG_TYPE "si-lower"
56
57STATISTIC(NumTailCalls, "Number of tail calls");
58
59static cl::opt<bool>
60 DisableLoopAlignment("amdgpu-disable-loop-alignment",
61 cl::desc("Do not align and prefetch loops"),
62 cl::init(Val: false));
63
64static cl::opt<bool> UseDivergentRegisterIndexing(
65 "amdgpu-use-divergent-register-indexing", cl::Hidden,
66 cl::desc("Use indirect register addressing for divergent indexes"),
67 cl::init(Val: false));
68
69static bool denormalModeIsFlushAllF32(const MachineFunction &MF) {
70 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
71 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
72}
73
74static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF) {
75 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
76 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
77}
78
79static unsigned findFirstFreeSGPR(CCState &CCInfo) {
80 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
81 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
82 if (!CCInfo.isAllocated(Reg: AMDGPU::SGPR0 + Reg)) {
83 return AMDGPU::SGPR0 + Reg;
84 }
85 }
86 llvm_unreachable("Cannot allocate sgpr");
87}
88
89SITargetLowering::SITargetLowering(const TargetMachine &TM,
90 const GCNSubtarget &STI)
91 : AMDGPUTargetLowering(TM, STI, STI), Subtarget(&STI) {
92 addRegisterClass(VT: MVT::i1, RC: &AMDGPU::VReg_1RegClass);
93 addRegisterClass(VT: MVT::i64, RC: &AMDGPU::SReg_64RegClass);
94
95 addRegisterClass(VT: MVT::i32, RC: &AMDGPU::SReg_32RegClass);
96
97 const SIRegisterInfo *TRI = STI.getRegisterInfo();
98 const TargetRegisterClass *V32RegClass =
99 TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 32);
100 addRegisterClass(VT: MVT::f32, RC: V32RegClass);
101
102 addRegisterClass(VT: MVT::v2i32, RC: &AMDGPU::SReg_64RegClass);
103
104 const TargetRegisterClass *V64RegClass =
105 TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 64);
106
107 addRegisterClass(VT: MVT::f64, RC: V64RegClass);
108 addRegisterClass(VT: MVT::v2f32, RC: V64RegClass);
109 addRegisterClass(VT: MVT::Untyped, RC: V64RegClass);
110
111 addRegisterClass(VT: MVT::v3i32, RC: &AMDGPU::SGPR_96RegClass);
112 addRegisterClass(VT: MVT::v3f32, RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 96));
113
114 addRegisterClass(VT: MVT::v2i64, RC: &AMDGPU::SGPR_128RegClass);
115 addRegisterClass(VT: MVT::v2f64, RC: &AMDGPU::SGPR_128RegClass);
116
117 addRegisterClass(VT: MVT::v4i32, RC: &AMDGPU::SGPR_128RegClass);
118 addRegisterClass(VT: MVT::v4f32, RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 128));
119
120 addRegisterClass(VT: MVT::v5i32, RC: &AMDGPU::SGPR_160RegClass);
121 addRegisterClass(VT: MVT::v5f32, RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 160));
122
123 addRegisterClass(VT: MVT::v6i32, RC: &AMDGPU::SGPR_192RegClass);
124 addRegisterClass(VT: MVT::v6f32, RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 192));
125
126 addRegisterClass(VT: MVT::v3i64, RC: &AMDGPU::SGPR_192RegClass);
127 addRegisterClass(VT: MVT::v3f64, RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 192));
128
129 addRegisterClass(VT: MVT::v7i32, RC: &AMDGPU::SGPR_224RegClass);
130 addRegisterClass(VT: MVT::v7f32, RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 224));
131
132 addRegisterClass(VT: MVT::v8i32, RC: &AMDGPU::SGPR_256RegClass);
133 addRegisterClass(VT: MVT::v8f32, RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 256));
134
135 addRegisterClass(VT: MVT::v4i64, RC: &AMDGPU::SGPR_256RegClass);
136 addRegisterClass(VT: MVT::v4f64, RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 256));
137
138 addRegisterClass(VT: MVT::v9i32, RC: &AMDGPU::SGPR_288RegClass);
139 addRegisterClass(VT: MVT::v9f32, RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 288));
140
141 addRegisterClass(VT: MVT::v10i32, RC: &AMDGPU::SGPR_320RegClass);
142 addRegisterClass(VT: MVT::v10f32,
143 RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 320));
144
145 addRegisterClass(VT: MVT::v11i32, RC: &AMDGPU::SGPR_352RegClass);
146 addRegisterClass(VT: MVT::v11f32,
147 RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 352));
148
149 addRegisterClass(VT: MVT::v12i32, RC: &AMDGPU::SGPR_384RegClass);
150 addRegisterClass(VT: MVT::v12f32,
151 RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 384));
152
153 addRegisterClass(VT: MVT::v16i32, RC: &AMDGPU::SGPR_512RegClass);
154 addRegisterClass(VT: MVT::v16f32,
155 RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 512));
156
157 addRegisterClass(VT: MVT::v8i64, RC: &AMDGPU::SGPR_512RegClass);
158 addRegisterClass(VT: MVT::v8f64, RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 512));
159
160 addRegisterClass(VT: MVT::v16i64, RC: &AMDGPU::SGPR_1024RegClass);
161 addRegisterClass(VT: MVT::v16f64,
162 RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 1024));
163
164 if (Subtarget->has16BitInsts()) {
165 if (Subtarget->useRealTrue16Insts()) {
166 addRegisterClass(VT: MVT::i16, RC: &AMDGPU::VGPR_16RegClass);
167 addRegisterClass(VT: MVT::f16, RC: &AMDGPU::VGPR_16RegClass);
168 addRegisterClass(VT: MVT::bf16, RC: &AMDGPU::VGPR_16RegClass);
169 } else {
170 addRegisterClass(VT: MVT::i16, RC: &AMDGPU::SReg_32RegClass);
171 addRegisterClass(VT: MVT::f16, RC: &AMDGPU::SReg_32RegClass);
172 addRegisterClass(VT: MVT::bf16, RC: &AMDGPU::SReg_32RegClass);
173 }
174
    // Unless there are also VOP3P operations, no operations are really legal.
176 addRegisterClass(VT: MVT::v2i16, RC: &AMDGPU::SReg_32RegClass);
177 addRegisterClass(VT: MVT::v2f16, RC: &AMDGPU::SReg_32RegClass);
178 addRegisterClass(VT: MVT::v2bf16, RC: &AMDGPU::SReg_32RegClass);
179 addRegisterClass(VT: MVT::v4i16, RC: &AMDGPU::SReg_64RegClass);
180 addRegisterClass(VT: MVT::v4f16, RC: &AMDGPU::SReg_64RegClass);
181 addRegisterClass(VT: MVT::v4bf16, RC: &AMDGPU::SReg_64RegClass);
182 addRegisterClass(VT: MVT::v8i16, RC: &AMDGPU::SGPR_128RegClass);
183 addRegisterClass(VT: MVT::v8f16, RC: &AMDGPU::SGPR_128RegClass);
184 addRegisterClass(VT: MVT::v8bf16, RC: &AMDGPU::SGPR_128RegClass);
185 addRegisterClass(VT: MVT::v16i16, RC: &AMDGPU::SGPR_256RegClass);
186 addRegisterClass(VT: MVT::v16f16, RC: &AMDGPU::SGPR_256RegClass);
187 addRegisterClass(VT: MVT::v16bf16, RC: &AMDGPU::SGPR_256RegClass);
188 addRegisterClass(VT: MVT::v32i16, RC: &AMDGPU::SGPR_512RegClass);
189 addRegisterClass(VT: MVT::v32f16, RC: &AMDGPU::SGPR_512RegClass);
190 addRegisterClass(VT: MVT::v32bf16, RC: &AMDGPU::SGPR_512RegClass);
191 }
192
193 addRegisterClass(VT: MVT::v32i32, RC: &AMDGPU::VReg_1024RegClass);
194 addRegisterClass(VT: MVT::v32f32,
195 RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 1024));
196
197 computeRegisterProperties(TRI: Subtarget->getRegisterInfo());
198
199 setMinFunctionAlignment(Align(4));
200 setPrefFunctionAlignment(Align(STI.getInstCacheLineSize()));
201
202 // The boolean content concept here is too inflexible. Compares only ever
203 // really produce a 1-bit result. Any copy/extend from these will turn into a
204 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
205 // it's what most targets use.
206 setBooleanContents(ZeroOrOneBooleanContent);
207 setBooleanVectorContents(ZeroOrOneBooleanContent);
208
209 // We need to custom lower vector stores from local memory
210 setOperationAction(Ops: ISD::LOAD,
211 VTs: {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
212 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
213 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
214 MVT::i1, MVT::v32i32},
215 Action: Custom);
216
217 setOperationAction(Ops: ISD::STORE,
218 VTs: {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
219 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
220 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
221 MVT::i1, MVT::v32i32},
222 Action: Custom);
223
224 if (isTypeLegal(VT: MVT::bf16)) {
225 for (unsigned Opc :
226 {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV,
227 ISD::FREM, ISD::FMA, ISD::FMINNUM, ISD::FMAXNUM,
228 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT,
229 ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI,
230 ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2,
231 ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
232 ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT,
233 ISD::FROUND, ISD::FROUNDEVEN, ISD::FFLOOR, ISD::FCANONICALIZE,
234 ISD::SETCC}) {
235 setOperationAction(Op: Opc, VT: MVT::bf16, Action: Promote);
236 }
237
238 setOperationAction(Op: ISD::FP_ROUND, VT: MVT::bf16, Action: Expand);
239
240 setOperationAction(Op: ISD::SELECT, VT: MVT::bf16, Action: Promote);
241 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::bf16, DestVT: MVT::i16);
242
243 setOperationAction(Op: ISD::FABS, VT: MVT::bf16, Action: Legal);
244 setOperationAction(Op: ISD::FNEG, VT: MVT::bf16, Action: Legal);
245 setOperationAction(Op: ISD::FCOPYSIGN, VT: MVT::bf16, Action: Legal);
246
247 // We only need to custom lower because we can't specify an action for bf16
248 // sources.
249 setOperationAction(Op: ISD::FP_TO_SINT, VT: MVT::i32, Action: Custom);
250 setOperationAction(Op: ISD::FP_TO_UINT, VT: MVT::i32, Action: Custom);
251 }
252
253 setTruncStoreAction(ValVT: MVT::v2i32, MemVT: MVT::v2i16, Action: Expand);
254 setTruncStoreAction(ValVT: MVT::v3i32, MemVT: MVT::v3i16, Action: Expand);
255 setTruncStoreAction(ValVT: MVT::v4i32, MemVT: MVT::v4i16, Action: Expand);
256 setTruncStoreAction(ValVT: MVT::v8i32, MemVT: MVT::v8i16, Action: Expand);
257 setTruncStoreAction(ValVT: MVT::v16i32, MemVT: MVT::v16i16, Action: Expand);
258 setTruncStoreAction(ValVT: MVT::v32i32, MemVT: MVT::v32i16, Action: Expand);
259 setTruncStoreAction(ValVT: MVT::v2i32, MemVT: MVT::v2i8, Action: Expand);
260 setTruncStoreAction(ValVT: MVT::v4i32, MemVT: MVT::v4i8, Action: Expand);
261 setTruncStoreAction(ValVT: MVT::v8i32, MemVT: MVT::v8i8, Action: Expand);
262 setTruncStoreAction(ValVT: MVT::v16i32, MemVT: MVT::v16i8, Action: Expand);
263 setTruncStoreAction(ValVT: MVT::v32i32, MemVT: MVT::v32i8, Action: Expand);
264 setTruncStoreAction(ValVT: MVT::v2i16, MemVT: MVT::v2i8, Action: Expand);
265 setTruncStoreAction(ValVT: MVT::v4i16, MemVT: MVT::v4i8, Action: Expand);
266 setTruncStoreAction(ValVT: MVT::v8i16, MemVT: MVT::v8i8, Action: Expand);
267 setTruncStoreAction(ValVT: MVT::v16i16, MemVT: MVT::v16i8, Action: Expand);
268 setTruncStoreAction(ValVT: MVT::v32i16, MemVT: MVT::v32i8, Action: Expand);
269
270 setTruncStoreAction(ValVT: MVT::v3i64, MemVT: MVT::v3i16, Action: Expand);
271 setTruncStoreAction(ValVT: MVT::v3i64, MemVT: MVT::v3i32, Action: Expand);
272 setTruncStoreAction(ValVT: MVT::v4i64, MemVT: MVT::v4i8, Action: Expand);
273 setTruncStoreAction(ValVT: MVT::v8i64, MemVT: MVT::v8i8, Action: Expand);
274 setTruncStoreAction(ValVT: MVT::v8i64, MemVT: MVT::v8i16, Action: Expand);
275 setTruncStoreAction(ValVT: MVT::v8i64, MemVT: MVT::v8i32, Action: Expand);
276 setTruncStoreAction(ValVT: MVT::v16i64, MemVT: MVT::v16i32, Action: Expand);
277
278 setOperationAction(Ops: ISD::GlobalAddress, VTs: {MVT::i32, MVT::i64}, Action: Custom);
279 setOperationAction(Ops: ISD::BlockAddress, VTs: {MVT::i32, MVT::i64}, Action: Custom);
280 setOperationAction(Ops: ISD::ExternalSymbol, VTs: {MVT::i32, MVT::i64}, Action: Custom);
281
282 setOperationAction(Op: ISD::SELECT, VT: MVT::i1, Action: Promote);
283 setOperationAction(Op: ISD::SELECT, VT: MVT::i64, Action: Custom);
284 setOperationAction(Op: ISD::SELECT, VT: MVT::f64, Action: Promote);
285 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::f64, DestVT: MVT::i64);
286
287 setOperationAction(Ops: ISD::FSQRT, VTs: {MVT::f32, MVT::f64}, Action: Custom);
288
289 setOperationAction(Ops: ISD::SELECT_CC,
290 VTs: {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Action: Expand);
291
292 setOperationAction(Op: ISD::SETCC, VT: MVT::i1, Action: Promote);
293 setOperationAction(Ops: ISD::SETCC, VTs: {MVT::v2i1, MVT::v4i1}, Action: Expand);
294 AddPromotedToType(Opc: ISD::SETCC, OrigVT: MVT::i1, DestVT: MVT::i32);
295
296 setOperationAction(Ops: ISD::TRUNCATE,
297 VTs: {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
298 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
299 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
300 Action: Expand);
301 setOperationAction(Ops: ISD::FP_ROUND,
302 VTs: {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
303 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
304 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
305 Action: Expand);
306
307 setOperationAction(Ops: ISD::SIGN_EXTEND_INREG,
308 VTs: {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
309 MVT::v3i16, MVT::v4i16, MVT::Other},
310 Action: Custom);
311
312 setOperationAction(Op: ISD::BRCOND, VT: MVT::Other, Action: Custom);
313 setOperationAction(Ops: ISD::BR_CC,
314 VTs: {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Action: Expand);
315
316 setOperationAction(Ops: {ISD::ABS, ISD::UADDO, ISD::USUBO}, VT: MVT::i32, Action: Legal);
317 setOperationAction(Ops: {ISD::UADDO, ISD::USUBO}, VT: MVT::i64, Action: Legal);
318
319 setOperationAction(Ops: {ISD::UADDO_CARRY, ISD::USUBO_CARRY}, VT: MVT::i32, Action: Legal);
320 setOperationAction(Ops: {ISD::UADDO_CARRY, ISD::USUBO_CARRY}, VT: MVT::i64, Action: Legal);
321
322 setOperationAction(Ops: {ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS}, VT: MVT::i64,
323 Action: Expand);
324
325 setOperationAction(Op: ISD::INLINEASM, VT: MVT::Other, Action: Custom);
326
327 // We only support LOAD/STORE and vector manipulation ops for vectors
328 // with > 4 elements.
329 for (MVT VT :
330 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
331 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
332 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
333 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
334 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
335 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
336 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
337 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
338 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
339 switch (Op) {
340 case ISD::LOAD:
341 case ISD::STORE:
342 case ISD::BUILD_VECTOR:
343 case ISD::BITCAST:
344 case ISD::UNDEF:
345 case ISD::EXTRACT_VECTOR_ELT:
346 case ISD::INSERT_VECTOR_ELT:
347 case ISD::SCALAR_TO_VECTOR:
348 case ISD::IS_FPCLASS:
349 break;
350 case ISD::EXTRACT_SUBVECTOR:
351 case ISD::INSERT_SUBVECTOR:
352 case ISD::CONCAT_VECTORS:
353 setOperationAction(Op, VT, Action: Custom);
354 break;
355 default:
356 setOperationAction(Op, VT, Action: Expand);
357 break;
358 }
359 }
360 }
361
362 setOperationAction(Op: ISD::FP_EXTEND, VT: MVT::v4f32, Action: Expand);
363
364 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
365 // is expanded to avoid having two separate loops in case the index is a VGPR.
366
367 // Most operations are naturally 32-bit vector operations. We only support
368 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
369 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
370 setOperationAction(Op: ISD::BUILD_VECTOR, VT: Vec64, Action: Promote);
371 AddPromotedToType(Opc: ISD::BUILD_VECTOR, OrigVT: Vec64, DestVT: MVT::v4i32);
372
373 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: Vec64, Action: Promote);
374 AddPromotedToType(Opc: ISD::EXTRACT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v4i32);
375
376 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec64, Action: Promote);
377 AddPromotedToType(Opc: ISD::INSERT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v4i32);
378
379 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: Vec64, Action: Promote);
380 AddPromotedToType(Opc: ISD::SCALAR_TO_VECTOR, OrigVT: Vec64, DestVT: MVT::v4i32);
381 }
382
383 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
384 setOperationAction(Op: ISD::BUILD_VECTOR, VT: Vec64, Action: Promote);
385 AddPromotedToType(Opc: ISD::BUILD_VECTOR, OrigVT: Vec64, DestVT: MVT::v6i32);
386
387 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: Vec64, Action: Promote);
388 AddPromotedToType(Opc: ISD::EXTRACT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v6i32);
389
390 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec64, Action: Promote);
391 AddPromotedToType(Opc: ISD::INSERT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v6i32);
392
393 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: Vec64, Action: Promote);
394 AddPromotedToType(Opc: ISD::SCALAR_TO_VECTOR, OrigVT: Vec64, DestVT: MVT::v6i32);
395 }
396
397 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
398 setOperationAction(Op: ISD::BUILD_VECTOR, VT: Vec64, Action: Promote);
399 AddPromotedToType(Opc: ISD::BUILD_VECTOR, OrigVT: Vec64, DestVT: MVT::v8i32);
400
401 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: Vec64, Action: Promote);
402 AddPromotedToType(Opc: ISD::EXTRACT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v8i32);
403
404 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec64, Action: Promote);
405 AddPromotedToType(Opc: ISD::INSERT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v8i32);
406
407 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: Vec64, Action: Promote);
408 AddPromotedToType(Opc: ISD::SCALAR_TO_VECTOR, OrigVT: Vec64, DestVT: MVT::v8i32);
409 }
410
411 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
412 setOperationAction(Op: ISD::BUILD_VECTOR, VT: Vec64, Action: Promote);
413 AddPromotedToType(Opc: ISD::BUILD_VECTOR, OrigVT: Vec64, DestVT: MVT::v16i32);
414
415 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: Vec64, Action: Promote);
416 AddPromotedToType(Opc: ISD::EXTRACT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v16i32);
417
418 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec64, Action: Promote);
419 AddPromotedToType(Opc: ISD::INSERT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v16i32);
420
421 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: Vec64, Action: Promote);
422 AddPromotedToType(Opc: ISD::SCALAR_TO_VECTOR, OrigVT: Vec64, DestVT: MVT::v16i32);
423 }
424
425 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
426 setOperationAction(Op: ISD::BUILD_VECTOR, VT: Vec64, Action: Promote);
427 AddPromotedToType(Opc: ISD::BUILD_VECTOR, OrigVT: Vec64, DestVT: MVT::v32i32);
428
429 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: Vec64, Action: Promote);
430 AddPromotedToType(Opc: ISD::EXTRACT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v32i32);
431
432 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec64, Action: Promote);
433 AddPromotedToType(Opc: ISD::INSERT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v32i32);
434
435 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: Vec64, Action: Promote);
436 AddPromotedToType(Opc: ISD::SCALAR_TO_VECTOR, OrigVT: Vec64, DestVT: MVT::v32i32);
437 }
438
439 setOperationAction(Ops: ISD::VECTOR_SHUFFLE,
440 VTs: {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
441 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
442 Action: Custom);
443
444 if (Subtarget->hasPkMovB32()) {
445 // TODO: 16-bit element vectors should be legal with even aligned elements.
446 // TODO: Can be legal with wider source types than the result with
447 // subregister extracts.
448 setOperationAction(Ops: ISD::VECTOR_SHUFFLE, VTs: {MVT::v2i32, MVT::v2f32}, Action: Legal);
449 }
450
451 setOperationAction(Ops: {ISD::AND, ISD::OR, ISD::XOR}, VT: MVT::v2i32, Action: Legal);
452 // Prevent SELECT v2i32 from being implemented with the above bitwise ops and
453 // instead lower to cndmask in SITargetLowering::LowerSELECT().
454 setOperationAction(Op: ISD::SELECT, VT: MVT::v2i32, Action: Custom);
455 // Enable MatchRotate to produce ISD::ROTR, which is later transformed to
456 // alignbit.
457 setOperationAction(Op: ISD::ROTR, VT: MVT::v2i32, Action: Custom);
458
459 setOperationAction(Ops: ISD::BUILD_VECTOR, VTs: {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
460 Action: Custom);
461
462 // Avoid stack access for these.
463 // TODO: Generalize to more vector types.
464 setOperationAction(Ops: {ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT},
465 VTs: {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
466 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
467 Action: Custom);
468
469 // Deal with vec3 vector operations when widened to vec4.
470 setOperationAction(Ops: ISD::INSERT_SUBVECTOR,
471 VTs: {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Action: Custom);
472
473 // Deal with vec5/6/7 vector operations when widened to vec8.
474 setOperationAction(Ops: ISD::INSERT_SUBVECTOR,
475 VTs: {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
476 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
477 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
478 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
479 Action: Custom);
480
481 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
482 // and output demarshalling
483 setOperationAction(Ops: ISD::ATOMIC_CMP_SWAP, VTs: {MVT::i32, MVT::i64}, Action: Custom);
484
485 // We can't return success/failure, only the old value,
486 // let LLVM add the comparison
487 setOperationAction(Ops: ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VTs: {MVT::i32, MVT::i64},
488 Action: Expand);
489
490 setOperationAction(Ops: ISD::ADDRSPACECAST, VTs: {MVT::i32, MVT::i64}, Action: Custom);
491
492 setOperationAction(Ops: ISD::BITREVERSE, VTs: {MVT::i32, MVT::i64}, Action: Legal);
493
494 // FIXME: This should be narrowed to i32, but that only happens if i64 is
495 // illegal.
496 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
497 setOperationAction(Ops: ISD::BSWAP, VTs: {MVT::i64, MVT::i32}, Action: Legal);
498
499 // On SI this is s_memtime and s_memrealtime on VI.
500 setOperationAction(Op: ISD::READCYCLECOUNTER, VT: MVT::i64, Action: Legal);
501
502 if (Subtarget->hasSMemRealTime() ||
503 Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
504 setOperationAction(Op: ISD::READSTEADYCOUNTER, VT: MVT::i64, Action: Legal);
505 setOperationAction(Ops: {ISD::TRAP, ISD::DEBUGTRAP}, VT: MVT::Other, Action: Custom);
506
507 if (Subtarget->has16BitInsts()) {
508 setOperationAction(Ops: {ISD::FPOW, ISD::FPOWI}, VT: MVT::f16, Action: Promote);
509 setOperationAction(Ops: {ISD::FLOG, ISD::FEXP, ISD::FLOG10}, VT: MVT::f16, Action: Custom);
510 setOperationAction(Ops: ISD::IS_FPCLASS, VTs: {MVT::f16, MVT::f32, MVT::f64}, Action: Legal);
511 setOperationAction(Ops: {ISD::FLOG2, ISD::FEXP2}, VT: MVT::f16, Action: Legal);
512 setOperationAction(Op: ISD::FCANONICALIZE, VT: MVT::f16, Action: Legal);
513 } else {
514 setOperationAction(Op: ISD::FSQRT, VT: MVT::f16, Action: Custom);
515 }
516
517 if (Subtarget->hasMadMacF32Insts())
518 setOperationAction(Op: ISD::FMAD, VT: MVT::f32, Action: Legal);
519
520 setOperationAction(Ops: {ISD::CTLZ, ISD::CTLZ_ZERO_POISON}, VT: MVT::i32, Action: Custom);
521 setOperationAction(Ops: {ISD::CTTZ, ISD::CTTZ_ZERO_POISON}, VT: MVT::i32, Action: Custom);
522 setOperationAction(Op: ISD::CTLS, VT: MVT::i32, Action: Custom);
523
524 // We only really have 32-bit BFE instructions (and 16-bit on VI).
525 //
526 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
527 // effort to match them now. We want this to be false for i64 cases when the
528 // extraction isn't restricted to the upper or lower half. Ideally we would
529 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
530 // span the midpoint are probably relatively rare, so don't worry about them
531 // for now.
532 setHasExtractBitsInsn(true);
533
534 // Clamp modifier on add/sub
535 if (Subtarget->hasIntClamp())
536 setOperationAction(Ops: {ISD::UADDSAT, ISD::USUBSAT}, VT: MVT::i32, Action: Legal);
537
538 if (Subtarget->hasAddNoCarryInsts())
539 setOperationAction(Ops: {ISD::SADDSAT, ISD::SSUBSAT}, VTs: {MVT::i16, MVT::i32},
540 Action: Legal);
541
542 setOperationAction(
543 Ops: {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
544 VTs: {MVT::f32, MVT::f64}, Action: Custom);
545
546 // These are really only legal for ieee_mode functions. We should be avoiding
547 // them for functions that don't have ieee_mode enabled, so just say they are
548 // legal.
549 setOperationAction(Ops: {ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
550 VTs: {MVT::f32, MVT::f64}, Action: Legal);
551
552 if (Subtarget->haveRoundOpsF64())
553 setOperationAction(Ops: {ISD::FTRUNC, ISD::FCEIL, ISD::FROUNDEVEN}, VT: MVT::f64,
554 Action: Legal);
555 else
556 setOperationAction(Ops: {ISD::FCEIL, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FFLOOR},
557 VT: MVT::f64, Action: Custom);
558
559 setOperationAction(Op: ISD::FFLOOR, VT: MVT::f64, Action: Legal);
560 setOperationAction(Ops: {ISD::FLDEXP, ISD::STRICT_FLDEXP}, VTs: {MVT::f32, MVT::f64},
561 Action: Legal);
562 setOperationAction(Ops: ISD::FFREXP, VTs: {MVT::f32, MVT::f64}, Action: Custom);
563
564 setOperationAction(Ops: {ISD::FSIN, ISD::FCOS, ISD::FDIV}, VT: MVT::f32, Action: Custom);
565 setOperationAction(Op: ISD::FDIV, VT: MVT::f64, Action: Custom);
566
567 setOperationAction(Ops: ISD::BF16_TO_FP, VTs: {MVT::i16, MVT::f32, MVT::f64}, Action: Expand);
568 setOperationAction(Ops: ISD::FP_TO_BF16, VTs: {MVT::i16, MVT::f32, MVT::f64}, Action: Expand);
569
570 setOperationAction(Ops: {ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}, VT: MVT::i32,
571 Action: Custom);
572 setOperationAction(Ops: {ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}, VT: MVT::i16,
573 Action: Custom);
574 setOperationAction(Ops: {ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}, VT: MVT::i1,
575 Action: Custom);
576
577 // Custom lower these because we can't specify a rule based on an illegal
578 // source bf16.
579 setOperationAction(Ops: {ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, VT: MVT::f32, Action: Custom);
580 setOperationAction(Ops: {ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, VT: MVT::f64, Action: Custom);
581
582 if (Subtarget->has16BitInsts()) {
583 setOperationAction(Ops: {ISD::Constant, ISD::SMIN, ISD::SMAX, ISD::UMIN,
584 ISD::UMAX, ISD::UADDSAT, ISD::USUBSAT},
585 VT: MVT::i16, Action: Legal);
586
587 AddPromotedToType(Opc: ISD::SIGN_EXTEND, OrigVT: MVT::i16, DestVT: MVT::i32);
588
589 setOperationAction(Ops: {ISD::ROTR, ISD::ROTL, ISD::SELECT_CC, ISD::BR_CC},
590 VT: MVT::i16, Action: Expand);
591
592 setOperationAction(Ops: {ISD::SIGN_EXTEND, ISD::SDIV, ISD::UDIV, ISD::SREM,
593 ISD::UREM, ISD::BITREVERSE, ISD::CTTZ,
594 ISD::CTTZ_ZERO_POISON, ISD::CTLZ, ISD::CTLZ_ZERO_POISON,
595 ISD::CTPOP},
596 VT: MVT::i16, Action: Promote);
597
598 setOperationAction(Op: ISD::LOAD, VT: MVT::i16, Action: Custom);
599
600 setTruncStoreAction(ValVT: MVT::i64, MemVT: MVT::i16, Action: Expand);
601
602 setOperationAction(Op: ISD::FP16_TO_FP, VT: MVT::i16, Action: Promote);
603 AddPromotedToType(Opc: ISD::FP16_TO_FP, OrigVT: MVT::i16, DestVT: MVT::i32);
604 setOperationAction(Op: ISD::FP_TO_FP16, VT: MVT::i16, Action: Promote);
605 AddPromotedToType(Opc: ISD::FP_TO_FP16, OrigVT: MVT::i16, DestVT: MVT::i32);
606
607 setOperationAction(Ops: {ISD::FP_TO_SINT, ISD::FP_TO_UINT}, VT: MVT::i16, Action: Custom);
608 setOperationAction(Ops: {ISD::FP_TO_SINT, ISD::FP_TO_UINT}, VT: MVT::i32, Action: Custom);
609 setOperationAction(Ops: {ISD::SINT_TO_FP, ISD::UINT_TO_FP}, VT: MVT::i16, Action: Custom);
610 setOperationAction(Ops: {ISD::SINT_TO_FP, ISD::UINT_TO_FP}, VT: MVT::i1, Action: Custom);
611
612 setOperationAction(Ops: {ISD::SINT_TO_FP, ISD::UINT_TO_FP}, VT: MVT::i32, Action: Custom);
613
614 // F16 - Constant Actions.
615 setOperationAction(Op: ISD::ConstantFP, VT: MVT::f16, Action: Legal);
616 setOperationAction(Op: ISD::ConstantFP, VT: MVT::bf16, Action: Legal);
617
618 // F16 - Load/Store Actions.
619 setOperationAction(Op: ISD::LOAD, VT: MVT::f16, Action: Promote);
620 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::f16, DestVT: MVT::i16);
621 setOperationAction(Op: ISD::STORE, VT: MVT::f16, Action: Promote);
622 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::f16, DestVT: MVT::i16);
623
624 // BF16 - Load/Store Actions.
625 setOperationAction(Op: ISD::LOAD, VT: MVT::bf16, Action: Promote);
626 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::bf16, DestVT: MVT::i16);
627 setOperationAction(Op: ISD::STORE, VT: MVT::bf16, Action: Promote);
628 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::bf16, DestVT: MVT::i16);
629
630 // F16 - VOP1 Actions.
631 setOperationAction(Ops: {ISD::FP_ROUND, ISD::STRICT_FP_ROUND, ISD::FCOS,
632 ISD::FSIN, ISD::FROUND},
633 VT: MVT::f16, Action: Custom);
634
635 // BF16 - VOP1 Actions.
636 if (Subtarget->hasBF16TransInsts())
637 setOperationAction(Ops: {ISD::FCOS, ISD::FSIN, ISD::FDIV}, VT: MVT::bf16, Action: Custom);
638
639 // F16 - VOP2 Actions.
640 setOperationAction(Ops: {ISD::BR_CC, ISD::SELECT_CC}, VTs: {MVT::f16, MVT::bf16},
641 Action: Expand);
642 setOperationAction(Ops: {ISD::FLDEXP, ISD::STRICT_FLDEXP}, VT: MVT::f16, Action: Custom);
643 setOperationAction(Op: ISD::FFREXP, VT: MVT::f16, Action: Custom);
644 setOperationAction(Op: ISD::FDIV, VT: MVT::f16, Action: Custom);
645
646 // F16 - VOP3 Actions.
647 setOperationAction(Op: ISD::FMA, VT: MVT::f16, Action: Legal);
648 if (STI.hasMadF16())
649 setOperationAction(Op: ISD::FMAD, VT: MVT::f16, Action: Legal);
650
651 for (MVT VT :
652 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
653 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
654 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
655 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
656 switch (Op) {
657 case ISD::LOAD:
658 case ISD::STORE:
659 case ISD::BUILD_VECTOR:
660 case ISD::BITCAST:
661 case ISD::UNDEF:
662 case ISD::EXTRACT_VECTOR_ELT:
663 case ISD::INSERT_VECTOR_ELT:
664 case ISD::INSERT_SUBVECTOR:
665 case ISD::SCALAR_TO_VECTOR:
666 case ISD::IS_FPCLASS:
667 break;
668 case ISD::EXTRACT_SUBVECTOR:
669 case ISD::CONCAT_VECTORS:
670 case ISD::FSIN:
671 case ISD::FCOS:
672 setOperationAction(Op, VT, Action: Custom);
673 break;
674 default:
675 setOperationAction(Op, VT, Action: Expand);
676 break;
677 }
678 }
679 }
680
681 // v_perm_b32 can handle either of these.
682 setOperationAction(Ops: ISD::BSWAP, VTs: {MVT::i16, MVT::v2i16}, Action: Legal);
683 setOperationAction(Op: ISD::BSWAP, VT: MVT::v4i16, Action: Custom);
684
685 // Legalize vector types for sat conversions to select v_cvt_pk_[iu]16_f32.
686 if (Subtarget->hasVCvtPkIU16F32())
687 setOperationAction(
688 Ops: {ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT},
689 VTs: {MVT::v2i16, MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16},
690 Action: Custom);
691
692 // XXX - Do these do anything? Vector constants turn into build_vector.
693 setOperationAction(Ops: ISD::Constant, VTs: {MVT::v2i16, MVT::v2f16}, Action: Legal);
694
695 setOperationAction(Ops: ISD::UNDEF, VTs: {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
696 Action: Legal);
697
698 setOperationAction(Op: ISD::STORE, VT: MVT::v2i16, Action: Promote);
699 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v2i16, DestVT: MVT::i32);
700 setOperationAction(Op: ISD::STORE, VT: MVT::v2f16, Action: Promote);
701 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v2f16, DestVT: MVT::i32);
702
703 setOperationAction(Op: ISD::LOAD, VT: MVT::v2i16, Action: Promote);
704 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v2i16, DestVT: MVT::i32);
705 setOperationAction(Op: ISD::LOAD, VT: MVT::v2f16, Action: Promote);
706 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v2f16, DestVT: MVT::i32);
707
708 setOperationAction(Op: ISD::ATOMIC_LOAD, VT: MVT::v2i16, Action: Promote);
709 AddPromotedToType(Opc: ISD::ATOMIC_LOAD, OrigVT: MVT::v2i16, DestVT: MVT::i32);
710 setOperationAction(Op: ISD::ATOMIC_LOAD, VT: MVT::v2f16, Action: Promote);
711 AddPromotedToType(Opc: ISD::ATOMIC_LOAD, OrigVT: MVT::v2f16, DestVT: MVT::i32);
712
713 setOperationAction(Op: ISD::ATOMIC_STORE, VT: MVT::v2i16, Action: Promote);
714 AddPromotedToType(Opc: ISD::ATOMIC_STORE, OrigVT: MVT::v2i16, DestVT: MVT::i32);
715 setOperationAction(Op: ISD::ATOMIC_STORE, VT: MVT::v2f16, Action: Promote);
716 AddPromotedToType(Opc: ISD::ATOMIC_STORE, OrigVT: MVT::v2f16, DestVT: MVT::i32);
717
718 setOperationAction(Op: ISD::AND, VT: MVT::v2i16, Action: Promote);
719 AddPromotedToType(Opc: ISD::AND, OrigVT: MVT::v2i16, DestVT: MVT::i32);
720 setOperationAction(Op: ISD::OR, VT: MVT::v2i16, Action: Promote);
721 AddPromotedToType(Opc: ISD::OR, OrigVT: MVT::v2i16, DestVT: MVT::i32);
722 setOperationAction(Op: ISD::XOR, VT: MVT::v2i16, Action: Promote);
723 AddPromotedToType(Opc: ISD::XOR, OrigVT: MVT::v2i16, DestVT: MVT::i32);
724
725 setOperationAction(Op: ISD::LOAD, VT: MVT::v4i16, Action: Promote);
726 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v4i16, DestVT: MVT::v2i32);
727 setOperationAction(Op: ISD::LOAD, VT: MVT::v4f16, Action: Promote);
728 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v4f16, DestVT: MVT::v2i32);
729 setOperationAction(Op: ISD::LOAD, VT: MVT::v4bf16, Action: Promote);
730 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v4bf16, DestVT: MVT::v2i32);
731
732 setOperationAction(Op: ISD::ATOMIC_LOAD, VT: MVT::v4i16, Action: Promote);
733 AddPromotedToType(Opc: ISD::ATOMIC_LOAD, OrigVT: MVT::v4i16, DestVT: MVT::i64);
734 setOperationAction(Op: ISD::ATOMIC_LOAD, VT: MVT::v4f16, Action: Promote);
735 AddPromotedToType(Opc: ISD::ATOMIC_LOAD, OrigVT: MVT::v4f16, DestVT: MVT::i64);
736
737 setOperationAction(Op: ISD::ATOMIC_STORE, VT: MVT::v4i16, Action: Promote);
738 AddPromotedToType(Opc: ISD::ATOMIC_STORE, OrigVT: MVT::v4i16, DestVT: MVT::i64);
739 setOperationAction(Op: ISD::ATOMIC_STORE, VT: MVT::v4f16, Action: Promote);
740 AddPromotedToType(Opc: ISD::ATOMIC_STORE, OrigVT: MVT::v4f16, DestVT: MVT::i64);
741
742 setOperationAction(Op: ISD::STORE, VT: MVT::v4i16, Action: Promote);
743 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4i16, DestVT: MVT::v2i32);
744 setOperationAction(Op: ISD::STORE, VT: MVT::v4f16, Action: Promote);
745 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4f16, DestVT: MVT::v2i32);
746 setOperationAction(Op: ISD::STORE, VT: MVT::v4bf16, Action: Promote);
747 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4bf16, DestVT: MVT::v2i32);
748
749 setOperationAction(Op: ISD::LOAD, VT: MVT::v8i16, Action: Promote);
750 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v8i16, DestVT: MVT::v4i32);
751 setOperationAction(Op: ISD::LOAD, VT: MVT::v8f16, Action: Promote);
752 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v8f16, DestVT: MVT::v4i32);
753 setOperationAction(Op: ISD::LOAD, VT: MVT::v8bf16, Action: Promote);
754 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v8bf16, DestVT: MVT::v4i32);
755
756 setOperationAction(Op: ISD::STORE, VT: MVT::v4i16, Action: Promote);
757 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4i16, DestVT: MVT::v2i32);
758 setOperationAction(Op: ISD::STORE, VT: MVT::v4f16, Action: Promote);
759 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4f16, DestVT: MVT::v2i32);
760
761 setOperationAction(Op: ISD::STORE, VT: MVT::v8i16, Action: Promote);
762 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v8i16, DestVT: MVT::v4i32);
763 setOperationAction(Op: ISD::STORE, VT: MVT::v8f16, Action: Promote);
764 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v8f16, DestVT: MVT::v4i32);
765 setOperationAction(Op: ISD::STORE, VT: MVT::v8bf16, Action: Promote);
766 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v8bf16, DestVT: MVT::v4i32);
767
768 setOperationAction(Op: ISD::LOAD, VT: MVT::v16i16, Action: Promote);
769 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v16i16, DestVT: MVT::v8i32);
770 setOperationAction(Op: ISD::LOAD, VT: MVT::v16f16, Action: Promote);
771 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v16f16, DestVT: MVT::v8i32);
772 setOperationAction(Op: ISD::LOAD, VT: MVT::v16bf16, Action: Promote);
773 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v16bf16, DestVT: MVT::v8i32);
774
775 setOperationAction(Op: ISD::STORE, VT: MVT::v16i16, Action: Promote);
776 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v16i16, DestVT: MVT::v8i32);
777 setOperationAction(Op: ISD::STORE, VT: MVT::v16f16, Action: Promote);
778 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v16f16, DestVT: MVT::v8i32);
779 setOperationAction(Op: ISD::STORE, VT: MVT::v16bf16, Action: Promote);
780 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v16bf16, DestVT: MVT::v8i32);
781
782 setOperationAction(Op: ISD::LOAD, VT: MVT::v32i16, Action: Promote);
783 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v32i16, DestVT: MVT::v16i32);
784 setOperationAction(Op: ISD::LOAD, VT: MVT::v32f16, Action: Promote);
785 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v32f16, DestVT: MVT::v16i32);
786 setOperationAction(Op: ISD::LOAD, VT: MVT::v32bf16, Action: Promote);
787 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v32bf16, DestVT: MVT::v16i32);
788
789 setOperationAction(Op: ISD::STORE, VT: MVT::v32i16, Action: Promote);
790 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v32i16, DestVT: MVT::v16i32);
791 setOperationAction(Op: ISD::STORE, VT: MVT::v32f16, Action: Promote);
792 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v32f16, DestVT: MVT::v16i32);
793 setOperationAction(Op: ISD::STORE, VT: MVT::v32bf16, Action: Promote);
794 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v32bf16, DestVT: MVT::v16i32);
795
796 setOperationAction(Ops: {ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
797 VT: MVT::v2i32, Action: Expand);
798 setOperationAction(Op: ISD::FP_EXTEND, VT: MVT::v2f32, Action: Expand);
799
800 setOperationAction(Ops: {ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
801 VT: MVT::v4i32, Action: Expand);
802
803 setOperationAction(Ops: {ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
804 VT: MVT::v8i32, Action: Expand);
805
806 setOperationAction(Ops: ISD::BUILD_VECTOR, VTs: {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
807 Action: Subtarget->hasVOP3PInsts() ? Legal : Custom);
808
809 setOperationAction(Ops: ISD::FNEG, VTs: {MVT::v2f16, MVT::v2bf16}, Action: Legal);
810 // This isn't really legal, but this avoids the legalizer unrolling it (and
811 // allows matching fneg (fabs x) patterns)
812 setOperationAction(Ops: ISD::FABS, VTs: {MVT::v2f16, MVT::v2bf16}, Action: Legal);
813
814 // Can do this in one BFI plus a constant materialize.
815 setOperationAction(Ops: ISD::FCOPYSIGN,
816 VTs: {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
817 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
818 MVT::v32f16, MVT::v32bf16},
819 Action: Custom);
820
821 setOperationAction(
822 Ops: {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
823 VT: MVT::f16, Action: Custom);
824 setOperationAction(Ops: {ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, VT: MVT::f16, Action: Legal);
825
826 setOperationAction(Ops: {ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, ISD::FMINIMUMNUM,
827 ISD::FMAXIMUMNUM},
828 VTs: {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
829 Action: Custom);
830
831 setOperationAction(Ops: {ISD::FMINNUM, ISD::FMAXNUM},
832 VTs: {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
833 Action: Expand);
834
835 for (MVT Vec16 :
836 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
837 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
838 setOperationAction(
839 Ops: {ISD::BUILD_VECTOR, ISD::EXTRACT_VECTOR_ELT, ISD::SCALAR_TO_VECTOR},
840 VT: Vec16, Action: Custom);
841 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec16, Action: Expand);
842 }
843 }
844
845 if (Subtarget->hasVOP3PInsts()) {
846 setOperationAction(Ops: {ISD::ADD, ISD::SUB, ISD::MUL, ISD::SHL, ISD::SRL,
847 ISD::SRA, ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX,
848 ISD::UADDSAT, ISD::USUBSAT, ISD::SADDSAT, ISD::SSUBSAT},
849 VT: MVT::v2i16, Action: Legal);
850
851 setOperationAction(Ops: {ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FNEG, ISD::FABS,
852 ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE,
853 ISD::FCANONICALIZE},
854 VT: MVT::v2f16, Action: Legal);
855
856 setOperationAction(Ops: ISD::EXTRACT_VECTOR_ELT,
857 VTs: {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Action: Custom);
858
859 setOperationAction(Ops: ISD::VECTOR_SHUFFLE,
860 VTs: {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
861 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
862 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
863 Action: Custom);
864
865 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
866 // Split vector operations.
867 setOperationAction(Ops: {ISD::SHL, ISD::SRA, ISD::SRL, ISD::ADD, ISD::SUB,
868 ISD::MUL, ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN,
869 ISD::UMAX, ISD::UADDSAT, ISD::SADDSAT, ISD::USUBSAT,
870 ISD::SSUBSAT},
871 VT, Action: Custom);
872
873 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
874 // Split vector operations.
875 setOperationAction(Ops: {ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FNEG, ISD::FABS,
876 ISD::FCANONICALIZE},
877 VT, Action: Custom);
878
879 setOperationAction(
880 Ops: {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
881 VTs: {MVT::v2f16, MVT::v4f16}, Action: Custom);
882
883 setOperationAction(Op: ISD::FEXP, VT: MVT::v2f16, Action: Custom);
884 setOperationAction(Ops: ISD::SELECT, VTs: {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
885 Action: Custom);
886
887 if (Subtarget->hasBF16PackedInsts()) {
888 setOperationAction(Ops: {ISD::FADD, ISD::FMUL, ISD::FMAXNUM, ISD::FMINNUM,
889 ISD::FMA, ISD::FNEG, ISD::FABS, ISD::FCANONICALIZE},
890 VT: MVT::v2bf16, Action: Legal);
891
892 for (MVT VT : {MVT::v4bf16, MVT::v8bf16, MVT::v16bf16, MVT::v32bf16})
893 // Split vector operations.
894 setOperationAction(Ops: {ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE,
895 ISD::FNEG, ISD::FABS},
896 VT, Action: Custom);
897 }
898
899 if (Subtarget->hasPackedFP32Ops()) {
900 setOperationAction(Ops: {ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FNEG},
901 VT: MVT::v2f32, Action: Legal);
902 setOperationAction(Ops: {ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FNEG},
903 VTs: {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
904 Action: Custom);
905 }
906 if (Subtarget->hasPackedFP64Ops()) {
907 setOperationAction(Ops: {ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FNEG,
908 ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE,
909 ISD::FCANONICALIZE, ISD::BUILD_VECTOR},
910 VT: MVT::v2f64, Action: Legal);
911 setOperationAction(
912 Ops: {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
913 VT: MVT::v2f64, Action: Custom);
914 setOperationAction(
915 Ops: {ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FNEG, ISD::FMINNUM_IEEE,
916 ISD::FMAXNUM_IEEE, ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM,
917 ISD::FMAXIMUMNUM, ISD::FCANONICALIZE},
918 VTs: {MVT::v4f64, MVT::v8f64, MVT::v16f64, MVT::v32f64}, Action: Custom);
919 }
920
921 if (Subtarget->hasPackedU64Ops()) {
922 setOperationAction(Ops: {ISD::ADD, ISD::SUB, ISD::SHL, ISD::BUILD_VECTOR},
923 VT: MVT::v2i64, Action: Legal);
924 setOperationAction(Ops: {ISD::ADD, ISD::SUB, ISD::SHL},
925 VTs: {MVT::v4i64, MVT::v8i64, MVT::v16i64, MVT::v32i64},
926 Action: Custom);
927 }
928 }
929
930 setOperationAction(Ops: {ISD::FNEG, ISD::FABS}, VT: MVT::v4f16, Action: Custom);
931
932 if (Subtarget->has16BitInsts()) {
933 setOperationAction(Op: ISD::SELECT, VT: MVT::v2i16, Action: Promote);
934 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v2i16, DestVT: MVT::i32);
935 setOperationAction(Op: ISD::SELECT, VT: MVT::v2f16, Action: Promote);
936 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v2f16, DestVT: MVT::i32);
937 } else {
938 // Legalization hack.
939 setOperationAction(Ops: ISD::SELECT, VTs: {MVT::v2i16, MVT::v2f16}, Action: Custom);
940
941 setOperationAction(Ops: {ISD::FNEG, ISD::FABS}, VT: MVT::v2f16, Action: Custom);
942 }
943
944 setOperationAction(Ops: ISD::SELECT,
945 VTs: {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
946 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
947 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
948 MVT::v32f16, MVT::v32bf16},
949 Action: Custom);
950
951 setOperationAction(Ops: {ISD::SMULO, ISD::UMULO}, VT: MVT::i64, Action: Custom);
952
953 if (Subtarget->hasVMulU64Inst())
954 setOperationAction(Op: ISD::MUL, VT: MVT::i64, Action: Legal);
955 else if (Subtarget->hasScalarSMulU64())
956 setOperationAction(Op: ISD::MUL, VT: MVT::i64, Action: Custom);
957
958 if (Subtarget->hasMad64_32())
959 setOperationAction(Ops: {ISD::SMUL_LOHI, ISD::UMUL_LOHI}, VT: MVT::i32, Action: Custom);
960
961 if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
962 setOperationAction(Op: ISD::PREFETCH, VT: MVT::Other, Action: Custom);
963
964 if (Subtarget->hasIEEEMinimumMaximumInsts()) {
965 setOperationAction(Ops: {ISD::FMAXIMUM, ISD::FMINIMUM},
966 VTs: {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Action: Legal);
967 } else {
968 // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
969 if (Subtarget->hasMinimum3Maximum3F32())
970 setOperationAction(Ops: {ISD::FMAXIMUM, ISD::FMINIMUM}, VT: MVT::f32, Action: Legal);
971
972 if (Subtarget->hasMinimum3Maximum3PKF16()) {
973 setOperationAction(Ops: {ISD::FMAXIMUM, ISD::FMINIMUM}, VT: MVT::v2f16, Action: Legal);
974
975 // If only the vector form is available, we need to widen to a vector.
976 if (!Subtarget->hasMinimum3Maximum3F16())
977 setOperationAction(Ops: {ISD::FMAXIMUM, ISD::FMINIMUM}, VT: MVT::f16, Action: Custom);
978 }
979 }
980
981 if (Subtarget->hasVOP3PInsts()) {
982 // We want to break these into v2f16 pieces, not scalarize.
983 setOperationAction(Ops: {ISD::FMINIMUM, ISD::FMAXIMUM},
984 VTs: {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
985 Action: Custom);
986 }
987
988 if (Subtarget->hasMinMaxI64Insts())
989 setOperationAction(Ops: {ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, VT: MVT::i64,
990 Action: Legal);
991
992 setOperationAction(Ops: ISD::INTRINSIC_WO_CHAIN,
993 VTs: {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
994 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
995 MVT::i8},
996 Action: Custom);
997
998 setOperationAction(Ops: ISD::INTRINSIC_W_CHAIN,
999 VTs: {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
1000 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
1001 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
1002 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
1003 Action: Custom);
1004
1005 setOperationAction(Ops: ISD::INTRINSIC_VOID,
1006 VTs: {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
1007 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
1008 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
1009 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
1010 Action: Custom);
1011
1012 setOperationAction(Op: ISD::STACKSAVE, VT: MVT::Other, Action: Custom);
1013 setOperationAction(Op: ISD::GET_ROUNDING, VT: MVT::i32, Action: Custom);
1014 setOperationAction(Op: ISD::SET_ROUNDING, VT: MVT::Other, Action: Custom);
1015 setOperationAction(Op: ISD::GET_FPENV, VT: MVT::i64, Action: Custom);
1016 setOperationAction(Op: ISD::SET_FPENV, VT: MVT::i64, Action: Custom);
1017
1018 // TODO: Could move this to custom lowering, could benefit from combines on
1019 // extract of relevant bits.
1020 setOperationAction(Op: ISD::GET_FPMODE, VT: MVT::i32, Action: Legal);
1021
1022 setOperationAction(Op: ISD::MUL, VT: MVT::i1, Action: Promote);
1023
1024 if (Subtarget->hasBF16ConversionInsts()) {
1025 setOperationAction(Ops: ISD::FP_ROUND, VTs: {MVT::bf16, MVT::v2bf16}, Action: Custom);
1026 setOperationAction(Op: ISD::BUILD_VECTOR, VT: MVT::v2bf16, Action: Legal);
1027 }
1028
1029 if (Subtarget->hasBF16TransInsts()) {
1030 setOperationAction(Ops: {ISD::FEXP2, ISD::FLOG2, ISD::FSQRT}, VT: MVT::bf16, Action: Legal);
1031 }
1032
1033 if (Subtarget->hasCvtPkF16F32Inst()) {
1034 setOperationAction(Ops: ISD::FP_ROUND,
1035 VTs: {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
1036 Action: Custom);
1037 }
1038
1039 setTargetDAGCombine({ISD::ADD,
1040 ISD::PTRADD,
1041 ISD::SUB,
1042 ISD::MUL,
1043 ISD::FADD,
1044 ISD::FSUB,
1045 ISD::FDIV,
1046 ISD::FMUL,
1047 ISD::FMINNUM,
1048 ISD::FMAXNUM,
1049 ISD::FMINNUM_IEEE,
1050 ISD::FMAXNUM_IEEE,
1051 ISD::FMINIMUM,
1052 ISD::FMAXIMUM,
1053 ISD::FMINIMUMNUM,
1054 ISD::FMAXIMUMNUM,
1055 ISD::FMA,
1056 ISD::ABS,
1057 ISD::SMIN,
1058 ISD::SMAX,
1059 ISD::UMIN,
1060 ISD::UMAX,
1061 ISD::SETCC,
1062 ISD::SELECT,
1063 ISD::SMIN,
1064 ISD::SMAX,
1065 ISD::UMIN,
1066 ISD::UMAX,
1067 ISD::USUBSAT,
1068 ISD::AND,
1069 ISD::OR,
1070 ISD::XOR,
1071 ISD::SHL,
1072 ISD::SRL,
1073 ISD::SRA,
1074 ISD::FSHR,
1075 ISD::SINT_TO_FP,
1076 ISD::UINT_TO_FP,
1077 ISD::FCANONICALIZE,
1078 ISD::SCALAR_TO_VECTOR,
1079 ISD::ZERO_EXTEND,
1080 ISD::SIGN_EXTEND_INREG,
1081 ISD::ANY_EXTEND,
1082 ISD::EXTRACT_VECTOR_ELT,
1083 ISD::INSERT_VECTOR_ELT,
1084 ISD::FCOPYSIGN});
1085
1086 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
1087 setTargetDAGCombine(ISD::FP_ROUND);
1088
1089 // All memory operations. Some folding on the pointer operand is done to help
1090 // matching the constant offsets in the addressing modes.
1091 setTargetDAGCombine({ISD::LOAD,
1092 ISD::STORE,
1093 ISD::ATOMIC_LOAD,
1094 ISD::ATOMIC_STORE,
1095 ISD::ATOMIC_CMP_SWAP,
1096 ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
1097 ISD::ATOMIC_SWAP,
1098 ISD::ATOMIC_LOAD_ADD,
1099 ISD::ATOMIC_LOAD_SUB,
1100 ISD::ATOMIC_LOAD_AND,
1101 ISD::ATOMIC_LOAD_OR,
1102 ISD::ATOMIC_LOAD_XOR,
1103 ISD::ATOMIC_LOAD_NAND,
1104 ISD::ATOMIC_LOAD_MIN,
1105 ISD::ATOMIC_LOAD_MAX,
1106 ISD::ATOMIC_LOAD_UMIN,
1107 ISD::ATOMIC_LOAD_UMAX,
1108 ISD::ATOMIC_LOAD_FADD,
1109 ISD::ATOMIC_LOAD_FMIN,
1110 ISD::ATOMIC_LOAD_FMAX,
1111 ISD::ATOMIC_LOAD_UINC_WRAP,
1112 ISD::ATOMIC_LOAD_UDEC_WRAP,
1113 ISD::ATOMIC_LOAD_USUB_COND,
1114 ISD::ATOMIC_LOAD_USUB_SAT,
1115 ISD::INTRINSIC_VOID,
1116 ISD::INTRINSIC_W_CHAIN});
1117
1118 // FIXME: In other contexts we pretend this is a per-function property.
1119 setStackPointerRegisterToSaveRestore(AMDGPU::SGPR32);
1120
1121 setSchedulingPreference(Sched::RegPressure);
1122}
1123
1124const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
1125
1126ArrayRef<MCPhysReg> SITargetLowering::getRoundingControlRegisters() const {
1127 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1128 return RCRegs;
1129}
1130
1131//===----------------------------------------------------------------------===//
1132// TargetLowering queries
1133//===----------------------------------------------------------------------===//
1134
1135// v_mad_mix* support a conversion from f16 to f32.
1136//
1137// There is only one special case when denormals are enabled we don't currently,
1138// where this is OK to use.
1139bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
1140 EVT DestVT, EVT SrcVT) const {
1141 return DestVT.getScalarType() == MVT::f32 &&
1142 ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1143 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1144 SrcVT.getScalarType() == MVT::f16) ||
1145 (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
1146 SrcVT.getScalarType() == MVT::bf16)) &&
1147 // TODO: This probably only requires no input flushing?
1148 denormalModeIsFlushAllF32(MF: DAG.getMachineFunction());
1149}
1150
1151bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode,
1152 LLT DestTy, LLT SrcTy) const {
1153 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1154 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1155 DestTy.getScalarSizeInBits() == 32 &&
1156 SrcTy.getScalarSizeInBits() == 16 &&
1157 // TODO: This probably only requires no input flushing?
1158 denormalModeIsFlushAllF32(MF: *MI.getMF());
1159}
1160
1161bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
1162 // SI has some legal vector types, but no legal vector operations. Say no
1163 // shuffles are legal in order to prefer scalarizing some vector operations.
1164 return false;
1165}
1166
1167MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
1168 CallingConv::ID CC,
1169 EVT VT) const {
1170 if (CC == CallingConv::AMDGPU_KERNEL)
1171 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1172
1173 if (VT.isVector()) {
1174 EVT ScalarVT = VT.getScalarType();
1175 unsigned Size = ScalarVT.getSizeInBits();
1176 if (Size == 16) {
1177 return Subtarget->has16BitInsts()
1178 ? MVT::getVectorVT(VT: ScalarVT.getSimpleVT(), NumElements: 2)
1179 : MVT::i32;
1180 }
1181
1182 if (Size < 16)
1183 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1184 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1185 }
1186
1187 if (!Subtarget->has16BitInsts() && VT.getSizeInBits() == 16)
1188 return MVT::i32;
1189
1190 if (VT.getSizeInBits() > 32)
1191 return MVT::i32;
1192
1193 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1194}
1195
1196unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
1197 CallingConv::ID CC,
1198 EVT VT) const {
1199 if (CC == CallingConv::AMDGPU_KERNEL)
1200 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1201
1202 if (VT.isVector()) {
1203 unsigned NumElts = VT.getVectorNumElements();
1204 EVT ScalarVT = VT.getScalarType();
1205 unsigned Size = ScalarVT.getSizeInBits();
1206
1207 // FIXME: Should probably promote 8-bit vectors to i16.
1208 if (Size == 16)
1209 return (NumElts + 1) / 2;
1210
1211 if (Size <= 32)
1212 return NumElts;
1213
1214 if (Size > 32)
1215 return NumElts * ((Size + 31) / 32);
1216 } else if (VT.getSizeInBits() > 32)
1217 return (VT.getSizeInBits() + 31) / 32;
1218
1219 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1220}
1221
1222unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
1223 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1224 unsigned &NumIntermediates, MVT &RegisterVT) const {
1225 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1226 unsigned NumElts = VT.getVectorNumElements();
1227 EVT ScalarVT = VT.getScalarType();
1228 unsigned Size = ScalarVT.getSizeInBits();
1229 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1230 // support, but unless we can properly handle 3-vectors, it will be still be
1231 // inconsistent.
1232 if (Size == 16) {
1233 MVT SimpleIntermediateVT =
1234 MVT::getVectorVT(VT: ScalarVT.getSimpleVT(), EC: ElementCount::getFixed(MinVal: 2));
1235 IntermediateVT = SimpleIntermediateVT;
1236 RegisterVT = Subtarget->has16BitInsts() ? SimpleIntermediateVT : MVT::i32;
1237 NumIntermediates = (NumElts + 1) / 2;
1238 return (NumElts + 1) / 2;
1239 }
1240
1241 if (Size == 32) {
1242 RegisterVT = ScalarVT.getSimpleVT();
1243 IntermediateVT = RegisterVT;
1244 NumIntermediates = NumElts;
1245 return NumIntermediates;
1246 }
1247
1248 if (Size < 16 && Subtarget->has16BitInsts()) {
1249 // FIXME: Should probably form v2i16 pieces
1250 RegisterVT = MVT::i16;
1251 IntermediateVT = ScalarVT;
1252 NumIntermediates = NumElts;
1253 return NumIntermediates;
1254 }
1255
1256 if (Size != 16 && Size <= 32) {
1257 RegisterVT = MVT::i32;
1258 IntermediateVT = ScalarVT;
1259 NumIntermediates = NumElts;
1260 return NumIntermediates;
1261 }
1262
1263 if (Size > 32) {
1264 RegisterVT = MVT::i32;
1265 IntermediateVT = RegisterVT;
1266 NumIntermediates = NumElts * ((Size + 31) / 32);
1267 return NumIntermediates;
1268 }
1269 }
1270
1271 return TargetLowering::getVectorTypeBreakdownForCallingConv(
1272 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1273}
1274
1275static EVT memVTFromLoadIntrData(const SITargetLowering &TLI,
1276 const DataLayout &DL, Type *Ty,
1277 unsigned MaxNumLanes) {
1278 assert(MaxNumLanes != 0);
1279
1280 LLVMContext &Ctx = Ty->getContext();
1281 if (auto *VT = dyn_cast<FixedVectorType>(Val: Ty)) {
1282 unsigned NumElts = std::min(a: MaxNumLanes, b: VT->getNumElements());
1283 return EVT::getVectorVT(Context&: Ctx, VT: TLI.getValueType(DL, Ty: VT->getElementType()),
1284 NumElements: NumElts);
1285 }
1286
1287 return TLI.getValueType(DL, Ty);
1288}
1289
1290// Peek through TFE struct returns to only use the data size.
1291static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI,
1292 const DataLayout &DL, Type *Ty,
1293 unsigned MaxNumLanes) {
1294 auto *ST = dyn_cast<StructType>(Val: Ty);
1295 if (!ST)
1296 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1297
1298 // TFE intrinsics return an aggregate type.
1299 assert(ST->getNumContainedTypes() == 2 &&
1300 ST->getContainedType(1)->isIntegerTy(32));
1301 return memVTFromLoadIntrData(TLI, DL, Ty: ST->getContainedType(i: 0), MaxNumLanes);
1302}
1303
1304/// Map address space 7 to MVT::amdgpuBufferFatPointer because that's its
1305/// in-memory representation. This return value is a custom type because there
1306/// is no MVT::i160 and adding one breaks integer promotion logic. While this
1307/// could cause issues during codegen, these address space 7 pointers will be
1308/// rewritten away by then. Therefore, we can return MVT::amdgpuBufferFatPointer
1309/// in order to allow pre-codegen passes that query TargetTransformInfo, often
1310/// for cost modeling, to work. (This also sets us up decently for doing the
1311/// buffer lowering in GlobalISel if SelectionDAG ever goes away.)
1312MVT SITargetLowering::getPointerTy(const DataLayout &DL, unsigned AS) const {
1313 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1314 return MVT::amdgpuBufferFatPointer;
1315 if (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
1316 DL.getPointerSizeInBits(AS) == 192)
1317 return MVT::amdgpuBufferStridedPointer;
1318 return AMDGPUTargetLowering::getPointerTy(DL, AS);
1319}
1320/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1321/// v8i32 when padding is added.
1322/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1323/// also v8i32 with padding.
1324MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const {
1325 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1326 DL.getPointerSizeInBits(AS) == 160) ||
1327 (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
1328 DL.getPointerSizeInBits(AS) == 192))
1329 return MVT::v8i32;
1330 return AMDGPUTargetLowering::getPointerMemTy(DL, AS);
1331}
1332
1333static unsigned getIntrMemWidth(unsigned IntrID) {
1334 switch (IntrID) {
1335 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1336 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1337 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1338 return 8;
1339 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1340 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1341 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1342 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1343 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1344 case Intrinsic::amdgcn_flat_load_monitor_b32:
1345 case Intrinsic::amdgcn_global_load_monitor_b32:
1346 return 32;
1347 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1348 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1349 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1350 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1351 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1352 case Intrinsic::amdgcn_flat_load_monitor_b64:
1353 case Intrinsic::amdgcn_global_load_monitor_b64:
1354 return 64;
1355 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1356 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1357 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1358 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
1359 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
1360 case Intrinsic::amdgcn_flat_load_monitor_b128:
1361 case Intrinsic::amdgcn_global_load_monitor_b128:
1362 return 128;
1363 default:
1364 llvm_unreachable("Unknown width");
1365 }
1366}
1367
1368static AtomicOrdering parseAtomicOrderingCABIArg(const CallBase &CI,
1369 unsigned ArgIdx) {
1370 Value *OrderingArg = CI.getArgOperand(i: ArgIdx);
1371 unsigned Ord = cast<ConstantInt>(Val: OrderingArg)->getZExtValue();
1372 switch (AtomicOrderingCABI(Ord)) {
1373 case AtomicOrderingCABI::acquire:
1374 return AtomicOrdering::Acquire;
1375 break;
1376 case AtomicOrderingCABI::release:
1377 return AtomicOrdering::Release;
1378 break;
1379 case AtomicOrderingCABI::seq_cst:
1380 return AtomicOrdering::SequentiallyConsistent;
1381 break;
1382 default:
1383 return AtomicOrdering::Monotonic;
1384 }
1385}
1386
1387static unsigned parseSyncscopeMDArg(const CallBase &CI, unsigned ArgIdx) {
1388 MDNode *ScopeMD = cast<MDNode>(
1389 Val: cast<MetadataAsValue>(Val: CI.getArgOperand(i: ArgIdx))->getMetadata());
1390 StringRef Scope = cast<MDString>(Val: ScopeMD->getOperand(I: 0))->getString();
1391 return CI.getContext().getOrInsertSyncScopeID(SSN: Scope);
1392}
1393
1394void SITargetLowering::getTgtMemIntrinsic(SmallVectorImpl<IntrinsicInfo> &Infos,
1395 const CallBase &CI,
1396 MachineFunction &MF,
1397 unsigned IntrID) const {
1398 MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
1399 if (CI.hasMetadata(KindID: LLVMContext::MD_invariant_load))
1400 Flags |= MachineMemOperand::MOInvariant;
1401 if (CI.hasMetadata(KindID: LLVMContext::MD_nontemporal))
1402 Flags |= MachineMemOperand::MONonTemporal;
1403 Flags |= getTargetMMOFlags(I: CI);
1404
1405 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1406 AMDGPU::lookupRsrcIntrinsic(Intr: IntrID)) {
1407 AttributeSet Attr =
1408 Intrinsic::getFnAttributes(C&: CI.getContext(), id: (Intrinsic::ID)IntrID);
1409 MemoryEffects ME = Attr.getMemoryEffects();
1410 if (ME.doesNotAccessMemory())
1411 return;
1412
1413 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1414 if (!IsSPrefetch) {
1415 auto *Aux = cast<ConstantInt>(Val: CI.getArgOperand(i: CI.arg_size() - 1));
1416 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1417 Flags |= MachineMemOperand::MOVolatile;
1418 }
1419 Flags |= MachineMemOperand::MODereferenceable;
1420
1421 IntrinsicInfo Info;
1422 // TODO: Should images get their own address space?
1423 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1424
1425 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1426 if (RsrcIntr->IsImage) {
1427 const AMDGPU::ImageDimIntrinsicInfo *Intr =
1428 AMDGPU::getImageDimIntrinsicInfo(Intr: IntrID);
1429 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Intr->BaseOpcode);
1430 Info.align.reset();
1431 }
1432
1433 Value *RsrcArg = CI.getArgOperand(i: RsrcIntr->RsrcArg);
1434 if (auto *RsrcPtrTy = dyn_cast<PointerType>(Val: RsrcArg->getType())) {
1435 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1436 // We conservatively set the memory operand of a buffer intrinsic to the
1437 // base resource pointer, so that we can access alias information about
1438 // those pointers. Cases like "this points at the same value
1439 // but with a different offset" are handled in
1440 // areMemAccessesTriviallyDisjoint.
1441 Info.ptrVal = RsrcArg;
1442 }
1443
1444 if (ME.onlyReadsMemory()) {
1445 if (RsrcIntr->IsImage) {
1446 unsigned MaxNumLanes = 4;
1447
1448 if (!BaseOpcode->Gather4) {
1449 // If this isn't a gather, we may have excess loaded elements in the
1450 // IR type. Check the dmask for the real number of elements loaded.
1451 unsigned DMask =
1452 cast<ConstantInt>(Val: CI.getArgOperand(i: 0))->getZExtValue();
1453 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(Value: DMask);
1454 }
1455
1456 Info.memVT = memVTFromLoadIntrReturn(TLI: *this, DL: MF.getDataLayout(),
1457 Ty: CI.getType(), MaxNumLanes);
1458 } else {
1459 Info.memVT =
1460 memVTFromLoadIntrReturn(TLI: *this, DL: MF.getDataLayout(), Ty: CI.getType(),
1461 MaxNumLanes: std::numeric_limits<unsigned>::max());
1462 }
1463
1464 // FIXME: What does alignment mean for an image?
1465 Info.opc = ISD::INTRINSIC_W_CHAIN;
1466 Info.flags = Flags | MachineMemOperand::MOLoad;
1467 } else if (ME.onlyWritesMemory()) {
1468 Info.opc = ISD::INTRINSIC_VOID;
1469
1470 Type *DataTy = CI.getArgOperand(i: 0)->getType();
1471 if (RsrcIntr->IsImage) {
1472 unsigned DMask = cast<ConstantInt>(Val: CI.getArgOperand(i: 1))->getZExtValue();
1473 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(Value: DMask);
1474 Info.memVT = memVTFromLoadIntrData(TLI: *this, DL: MF.getDataLayout(), Ty: DataTy,
1475 MaxNumLanes: DMaskLanes);
1476 } else
1477 Info.memVT = getValueType(DL: MF.getDataLayout(), Ty: DataTy);
1478
1479 Info.flags = Flags | MachineMemOperand::MOStore;
1480 } else {
1481 // Atomic, NoReturn Sampler or prefetch
1482 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID
1483 : ISD::INTRINSIC_W_CHAIN;
1484
1485 switch (IntrID) {
1486 default:
1487 Info.flags = Flags | MachineMemOperand::MOLoad;
1488 if (!IsSPrefetch)
1489 Info.flags |= MachineMemOperand::MOStore;
1490
1491 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1492 // Fake memory access type for no return sampler intrinsics
1493 Info.memVT = MVT::i32;
1494 } else {
1495 // XXX - Should this be volatile without known ordering?
1496 Info.flags |= MachineMemOperand::MOVolatile;
1497 Info.memVT = MVT::getVT(Ty: CI.getArgOperand(i: 0)->getType());
1498 }
1499 break;
1500 case Intrinsic::amdgcn_raw_buffer_load_lds:
1501 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
1502 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1503 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
1504 case Intrinsic::amdgcn_struct_buffer_load_lds:
1505 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
1506 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
1507 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: {
1508 unsigned Width = cast<ConstantInt>(Val: CI.getArgOperand(i: 2))->getZExtValue();
1509
1510 // Entry 0: Load from buffer.
1511 // Don't set an offset, since the pointer value always represents the
1512 // base of the buffer.
1513 Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: Width * 8);
1514 Info.flags = Flags | MachineMemOperand::MOLoad;
1515 Infos.push_back(Elt: Info);
1516
1517 // Entry 1: Store to LDS.
1518 // Instruction offset is applied, and an additional per-lane offset
1519 // which we simulate using a larger memory type.
1520 Info.memVT = EVT::getIntegerVT(
1521 Context&: CI.getContext(), BitWidth: Width * 8 * Subtarget->getWavefrontSize());
1522 Info.ptrVal = CI.getArgOperand(i: 1); // LDS destination pointer
1523 Info.offset = cast<ConstantInt>(Val: CI.getArgOperand(i: CI.arg_size() - 2))
1524 ->getZExtValue();
1525 Info.fallbackAddressSpace = AMDGPUAS::LOCAL_ADDRESS;
1526 Info.flags = Flags | MachineMemOperand::MOStore;
1527 Infos.push_back(Elt: Info);
1528 return;
1529 }
1530 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1531 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1532 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1533 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1534 Info.memVT =
1535 memVTFromLoadIntrReturn(TLI: *this, DL: MF.getDataLayout(), Ty: CI.getType(),
1536 MaxNumLanes: std::numeric_limits<unsigned>::max());
1537 Info.flags = Flags | MachineMemOperand::MOLoad;
1538 Infos.push_back(Elt: Info);
1539 return;
1540 }
1541 }
1542 }
1543 Infos.push_back(Elt: Info);
1544 return;
1545 }
1546
1547 IntrinsicInfo Info;
1548 switch (IntrID) {
1549 case Intrinsic::amdgcn_ds_ordered_add:
1550 case Intrinsic::amdgcn_ds_ordered_swap: {
1551 Info.opc = ISD::INTRINSIC_W_CHAIN;
1552 Info.memVT = MVT::getVT(Ty: CI.getType());
1553 Info.ptrVal = CI.getOperand(i_nocapture: 0);
1554 Info.align.reset();
1555 Info.flags = Flags | MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1556
1557 const ConstantInt *Vol = cast<ConstantInt>(Val: CI.getOperand(i_nocapture: 4));
1558 if (!Vol->isZero())
1559 Info.flags |= MachineMemOperand::MOVolatile;
1560
1561 Infos.push_back(Elt: Info);
1562 return;
1563 }
1564 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1565 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1566 Info.opc = ISD::INTRINSIC_W_CHAIN;
1567 Info.memVT = MVT::getVT(Ty: CI.getOperand(i_nocapture: 0)->getType());
1568 Info.ptrVal = nullptr;
1569 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1570 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1571 Infos.push_back(Elt: Info);
1572 return;
1573 }
1574 case Intrinsic::amdgcn_ds_append:
1575 case Intrinsic::amdgcn_ds_consume: {
1576 Info.opc = ISD::INTRINSIC_W_CHAIN;
1577 Info.memVT = MVT::getVT(Ty: CI.getType());
1578 Info.ptrVal = CI.getOperand(i_nocapture: 0);
1579 Info.align.reset();
1580 Info.flags = Flags | MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1581
1582 const ConstantInt *Vol = cast<ConstantInt>(Val: CI.getOperand(i_nocapture: 1));
1583 if (!Vol->isZero())
1584 Info.flags |= MachineMemOperand::MOVolatile;
1585
1586 Infos.push_back(Elt: Info);
1587 return;
1588 }
1589 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1590 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1591 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1592 ? ISD::INTRINSIC_W_CHAIN
1593 : ISD::INTRINSIC_VOID;
1594 Info.memVT = MVT::getVT(Ty: CI.getType());
1595 Info.ptrVal = CI.getOperand(i_nocapture: 0);
1596 Info.memVT = MVT::i64;
1597 Info.size = 8;
1598 Info.align.reset();
1599 Info.flags = Flags | MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1600 Info.order = AtomicOrdering::Monotonic;
1601 Infos.push_back(Elt: Info);
1602 return;
1603 }
1604 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1605 case Intrinsic::amdgcn_image_bvh_intersect_ray:
1606 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1607 Info.opc = ISD::INTRINSIC_W_CHAIN;
1608 Info.memVT =
1609 MVT::getVT(Ty: IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1610 ? CI.getType()
1611 : cast<StructType>(Val: CI.getType())
1612 ->getElementType(N: 0)); // XXX: what is correct VT?
1613
1614 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1615 Info.align.reset();
1616 Info.flags = Flags | MachineMemOperand::MOLoad |
1617 MachineMemOperand::MODereferenceable;
1618 Infos.push_back(Elt: Info);
1619 return;
1620 }
1621 case Intrinsic::amdgcn_global_atomic_fmin_num:
1622 case Intrinsic::amdgcn_global_atomic_fmax_num:
1623 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1624 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1625 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
1626 Info.opc = ISD::INTRINSIC_W_CHAIN;
1627 Info.memVT = MVT::getVT(Ty: CI.getType());
1628 Info.ptrVal = CI.getOperand(i_nocapture: 0);
1629 Info.align.reset();
1630 Info.flags =
1631 Flags | MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
1632 MachineMemOperand::MODereferenceable | MachineMemOperand::MOVolatile;
1633 Infos.push_back(Elt: Info);
1634 return;
1635 }
1636 case Intrinsic::amdgcn_cluster_load_b32:
1637 case Intrinsic::amdgcn_cluster_load_b64:
1638 case Intrinsic::amdgcn_cluster_load_b128:
1639 case Intrinsic::amdgcn_ds_load_tr6_b96:
1640 case Intrinsic::amdgcn_ds_load_tr4_b64:
1641 case Intrinsic::amdgcn_ds_load_tr8_b64:
1642 case Intrinsic::amdgcn_ds_load_tr16_b128:
1643 case Intrinsic::amdgcn_global_load_tr6_b96:
1644 case Intrinsic::amdgcn_global_load_tr4_b64:
1645 case Intrinsic::amdgcn_global_load_tr_b64:
1646 case Intrinsic::amdgcn_global_load_tr_b128:
1647 case Intrinsic::amdgcn_ds_read_tr4_b64:
1648 case Intrinsic::amdgcn_ds_read_tr6_b96:
1649 case Intrinsic::amdgcn_ds_read_tr8_b64:
1650 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1651 Info.opc = ISD::INTRINSIC_W_CHAIN;
1652 Info.memVT = MVT::getVT(Ty: CI.getType());
1653 Info.ptrVal = CI.getOperand(i_nocapture: 0);
1654 Info.align.reset();
1655 Info.flags = Flags | MachineMemOperand::MOLoad;
1656 Infos.push_back(Elt: Info);
1657 return;
1658 }
1659 case Intrinsic::amdgcn_flat_load_monitor_b32:
1660 case Intrinsic::amdgcn_flat_load_monitor_b64:
1661 case Intrinsic::amdgcn_flat_load_monitor_b128:
1662 case Intrinsic::amdgcn_global_load_monitor_b32:
1663 case Intrinsic::amdgcn_global_load_monitor_b64:
1664 case Intrinsic::amdgcn_global_load_monitor_b128: {
1665 Info.opc = ISD::INTRINSIC_W_CHAIN;
1666 Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: getIntrMemWidth(IntrID));
1667 Info.ptrVal = CI.getOperand(i_nocapture: 0);
1668 Info.align.reset();
1669 Info.flags = MachineMemOperand::MOLoad;
1670 Info.order = parseAtomicOrderingCABIArg(CI, ArgIdx: 1);
1671 Info.ssid = parseSyncscopeMDArg(CI, ArgIdx: 2);
1672 Infos.push_back(Elt: Info);
1673 return;
1674 }
1675 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1676 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1677 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
1678 Info.opc = ISD::INTRINSIC_W_CHAIN;
1679 Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: getIntrMemWidth(IntrID));
1680 Info.ptrVal = CI.getOperand(i_nocapture: 0);
1681 Info.align.reset();
1682 Info.flags = (MachineMemOperand::MOLoad | MOCooperative);
1683 Info.order = parseAtomicOrderingCABIArg(CI, ArgIdx: 1);
1684 Info.ssid = parseSyncscopeMDArg(CI, ArgIdx: 2);
1685 Infos.push_back(Elt: Info);
1686 return;
1687 }
1688 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1689 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1690 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
1691 Info.opc = ISD::INTRINSIC_VOID;
1692 Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: getIntrMemWidth(IntrID));
1693 Info.ptrVal = CI.getArgOperand(i: 0);
1694 Info.align.reset();
1695 Info.flags = (MachineMemOperand::MOStore | MOCooperative);
1696 Info.order = parseAtomicOrderingCABIArg(CI, ArgIdx: 2);
1697 Info.ssid = parseSyncscopeMDArg(CI, ArgIdx: 3);
1698 Infos.push_back(Elt: Info);
1699 return;
1700 }
1701 case Intrinsic::amdgcn_ds_gws_init:
1702 case Intrinsic::amdgcn_ds_gws_barrier:
1703 case Intrinsic::amdgcn_ds_gws_sema_v:
1704 case Intrinsic::amdgcn_ds_gws_sema_br:
1705 case Intrinsic::amdgcn_ds_gws_sema_p:
1706 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1707 Info.opc = ISD::INTRINSIC_VOID;
1708
1709 const GCNTargetMachine &TM =
1710 static_cast<const GCNTargetMachine &>(getTargetMachine());
1711
1712 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1713 Info.ptrVal = MFI->getGWSPSV(TM);
1714
1715 // This is an abstract access, but we need to specify a type and size.
1716 Info.memVT = MVT::i32;
1717 Info.size = 4;
1718 Info.align = Align(4);
1719
1720 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1721 Info.flags = Flags | MachineMemOperand::MOLoad;
1722 else
1723 Info.flags = Flags | MachineMemOperand::MOStore;
1724 Infos.push_back(Elt: Info);
1725 return;
1726 }
1727 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1728 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1729 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1730 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1731 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1732 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1733 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1734 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
1735 // Entry 0: Load from source (global/flat).
1736 Info.opc = ISD::INTRINSIC_VOID;
1737 Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: getIntrMemWidth(IntrID));
1738 Info.ptrVal = CI.getArgOperand(i: 0); // Global pointer
1739 Info.offset = cast<ConstantInt>(Val: CI.getArgOperand(i: 2))->getSExtValue();
1740 Info.flags = Flags | MachineMemOperand::MOLoad;
1741 Infos.push_back(Elt: Info);
1742
1743 // Entry 1: Store to LDS (same offset).
1744 Info.flags = Flags | MachineMemOperand::MOStore;
1745 Info.ptrVal = CI.getArgOperand(i: 1); // LDS pointer
1746 Infos.push_back(Elt: Info);
1747 return;
1748 }
1749 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1750 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1751 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1752 case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
1753 // Entry 0: Load from LDS.
1754 Info.opc = ISD::INTRINSIC_VOID;
1755 Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: getIntrMemWidth(IntrID));
1756 Info.ptrVal = CI.getArgOperand(i: 1); // LDS pointer
1757 Info.offset = cast<ConstantInt>(Val: CI.getArgOperand(i: 2))->getSExtValue();
1758 Info.flags = Flags | MachineMemOperand::MOLoad;
1759 Infos.push_back(Elt: Info);
1760
1761 // Entry 1: Store to global (same offset).
1762 Info.flags = Flags | MachineMemOperand::MOStore;
1763 Info.ptrVal = CI.getArgOperand(i: 0); // Global pointer
1764 Infos.push_back(Elt: Info);
1765 return;
1766 }
1767 case Intrinsic::amdgcn_av_load_b128:
1768 case Intrinsic::amdgcn_av_store_b128: {
1769 bool IsStore = IntrID == Intrinsic::amdgcn_av_store_b128;
1770 Info.opc = IsStore ? ISD::INTRINSIC_VOID : ISD::INTRINSIC_W_CHAIN;
1771 Info.memVT = MVT::v4i32;
1772 Info.ptrVal = CI.getArgOperand(i: 0);
1773 Info.align = Align(16);
1774 Info.flags |=
1775 IsStore ? MachineMemOperand::MOStore : MachineMemOperand::MOLoad;
1776 // Pretend to be atomic so that SIMemoryLegalizer::expandStore sets cache
1777 // flags appropriately.
1778 Info.order = AtomicOrdering::Monotonic;
1779
1780 LLVMContext &Ctx = CI.getContext();
1781 unsigned ScopeIdx = CI.arg_size() - 1;
1782 MDNode *ScopeMD = cast<MDNode>(
1783 Val: cast<MetadataAsValue>(Val: CI.getArgOperand(i: ScopeIdx))->getMetadata());
1784 StringRef Scope = cast<MDString>(Val: ScopeMD->getOperand(I: 0))->getString();
1785 Info.ssid = Ctx.getOrInsertSyncScopeID(SSN: Scope);
1786 Infos.push_back(Elt: Info);
1787 return;
1788 }
1789 case Intrinsic::amdgcn_load_to_lds:
1790 case Intrinsic::amdgcn_load_async_to_lds:
1791 case Intrinsic::amdgcn_global_load_lds:
1792 case Intrinsic::amdgcn_global_load_async_lds: {
1793 unsigned Width = cast<ConstantInt>(Val: CI.getArgOperand(i: 2))->getZExtValue();
1794 auto *Aux = cast<ConstantInt>(Val: CI.getArgOperand(i: CI.arg_size() - 1));
1795 bool IsVolatile = Aux->getZExtValue() & AMDGPU::CPol::VOLATILE;
1796 if (IsVolatile)
1797 Flags |= MachineMemOperand::MOVolatile;
1798
1799 // Entry 0: Load from source (global/flat).
1800 Info.opc = ISD::INTRINSIC_VOID;
1801 Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: Width * 8);
1802 Info.ptrVal = CI.getArgOperand(i: 0); // Source pointer
1803 Info.offset = cast<ConstantInt>(Val: CI.getArgOperand(i: 3))->getSExtValue();
1804 Info.flags = Flags | MachineMemOperand::MOLoad;
1805 Infos.push_back(Elt: Info);
1806
1807 // Entry 1: Store to LDS.
1808 // Same offset from the instruction, but an additional per-lane offset is
1809 // added. Represent that using a wider memory type.
1810 Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(),
1811 BitWidth: Width * 8 * Subtarget->getWavefrontSize());
1812 Info.ptrVal = CI.getArgOperand(i: 1); // LDS destination pointer
1813 Info.flags = Flags | MachineMemOperand::MOStore;
1814 Infos.push_back(Elt: Info);
1815 return;
1816 }
1817 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1818 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1819 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1820 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1821 Info.opc = ISD::INTRINSIC_W_CHAIN;
1822
1823 const GCNTargetMachine &TM =
1824 static_cast<const GCNTargetMachine &>(getTargetMachine());
1825
1826 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1827 Info.ptrVal = MFI->getGWSPSV(TM);
1828
1829 // This is an abstract access, but we need to specify a type and size.
1830 Info.memVT = MVT::i32;
1831 Info.size = 4;
1832 Info.align = Align(4);
1833
1834 Info.flags = Flags | MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1835 Infos.push_back(Elt: Info);
1836 return;
1837 }
1838 case Intrinsic::amdgcn_s_prefetch_data:
1839 case Intrinsic::amdgcn_s_prefetch_inst:
1840 case Intrinsic::amdgcn_flat_prefetch:
1841 case Intrinsic::amdgcn_global_prefetch: {
1842 Info.opc = ISD::INTRINSIC_VOID;
1843 Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: 8);
1844 Info.ptrVal = CI.getArgOperand(i: 0);
1845 Info.flags = Flags | MachineMemOperand::MOLoad;
1846 Infos.push_back(Elt: Info);
1847 return;
1848 }
1849 default:
1850 return;
1851 }
1852}
1853
1854void SITargetLowering::CollectTargetIntrinsicOperands(
1855 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1856 switch (cast<IntrinsicInst>(Val: I).getIntrinsicID()) {
1857 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1858 // The DAG's ValueType loses the addrspaces.
1859 // Add them as 2 extra Constant operands "from" and "to".
1860 unsigned SrcAS = I.getOperand(i_nocapture: 0)->getType()->getPointerAddressSpace();
1861 unsigned DstAS = I.getType()->getPointerAddressSpace();
1862 Ops.push_back(Elt: DAG.getTargetConstant(Val: SrcAS, DL: SDLoc(), VT: MVT::i32));
1863 Ops.push_back(Elt: DAG.getTargetConstant(Val: DstAS, DL: SDLoc(), VT: MVT::i32));
1864 break;
1865 }
1866 default:
1867 break;
1868 }
1869}
1870
1871bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
1872 SmallVectorImpl<Value *> &Ops,
1873 Type *&AccessTy) const {
1874 Value *Ptr = nullptr;
1875 switch (II->getIntrinsicID()) {
1876 case Intrinsic::amdgcn_cluster_load_b128:
1877 case Intrinsic::amdgcn_cluster_load_b64:
1878 case Intrinsic::amdgcn_cluster_load_b32:
1879 case Intrinsic::amdgcn_ds_append:
1880 case Intrinsic::amdgcn_ds_consume:
1881 case Intrinsic::amdgcn_ds_load_tr8_b64:
1882 case Intrinsic::amdgcn_ds_load_tr16_b128:
1883 case Intrinsic::amdgcn_ds_load_tr4_b64:
1884 case Intrinsic::amdgcn_ds_load_tr6_b96:
1885 case Intrinsic::amdgcn_ds_read_tr4_b64:
1886 case Intrinsic::amdgcn_ds_read_tr6_b96:
1887 case Intrinsic::amdgcn_ds_read_tr8_b64:
1888 case Intrinsic::amdgcn_ds_read_tr16_b64:
1889 case Intrinsic::amdgcn_ds_ordered_add:
1890 case Intrinsic::amdgcn_ds_ordered_swap:
1891 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1892 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1893 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1894 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1895 case Intrinsic::amdgcn_global_atomic_fmax_num:
1896 case Intrinsic::amdgcn_global_atomic_fmin_num:
1897 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1898 case Intrinsic::amdgcn_global_load_tr_b64:
1899 case Intrinsic::amdgcn_global_load_tr_b128:
1900 case Intrinsic::amdgcn_global_load_tr4_b64:
1901 case Intrinsic::amdgcn_global_load_tr6_b96:
1902 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1903 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1904 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1905 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1906 case Intrinsic::amdgcn_av_load_b128:
1907 case Intrinsic::amdgcn_av_store_b128:
1908 Ptr = II->getArgOperand(i: 0);
1909 break;
1910 case Intrinsic::amdgcn_load_to_lds:
1911 case Intrinsic::amdgcn_load_async_to_lds:
1912 case Intrinsic::amdgcn_global_load_lds:
1913 case Intrinsic::amdgcn_global_load_async_lds:
1914 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1915 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1916 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1917 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1918 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1919 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1920 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1921 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1922 Ptr = II->getArgOperand(i: 1);
1923 break;
1924 default:
1925 return false;
1926 }
1927 AccessTy = II->getType();
1928 Ops.push_back(Elt: Ptr);
1929 return true;
1930}
1931
1932bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM,
1933 unsigned AddrSpace) const {
1934 if (!Subtarget->hasFlatInstOffsets()) {
1935 // Flat instructions do not have offsets, and only have the register
1936 // address.
1937 return AM.BaseOffs == 0 && AM.Scale == 0;
1938 }
1939
1940 using AMDGPU::FlatAddrSpace;
1941 FlatAddrSpace FlatVariant =
1942 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ? FlatAddrSpace::FlatGlobal
1943 : AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ? FlatAddrSpace::FlatScratch
1944 : FlatAddrSpace::FLAT;
1945
1946 return AM.Scale == 0 &&
1947 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1948 Offset: AM.BaseOffs, AddrSpace, FlatVariant));
1949}
1950
1951bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
1952 if (Subtarget->hasFlatGlobalInsts())
1953 return isLegalFlatAddressingMode(AM, AddrSpace: AMDGPUAS::GLOBAL_ADDRESS);
1954
1955 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1956 // Assume the we will use FLAT for all global memory accesses
1957 // on VI.
1958 // FIXME: This assumption is currently wrong. On VI we still use
1959 // MUBUF instructions for the r + i addressing mode. As currently
1960 // implemented, the MUBUF instructions only work on buffer < 4GB.
1961 // It may be possible to support > 4GB buffers with MUBUF instructions,
1962 // by setting the stride value in the resource descriptor which would
1963 // increase the size limit to (stride * 4GB). However, this is risky,
1964 // because it has never been validated.
1965 return isLegalFlatAddressingMode(AM, AddrSpace: AMDGPUAS::FLAT_ADDRESS);
1966 }
1967
1968 return isLegalMUBUFAddressingMode(AM);
1969}
1970
1971bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1972 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1973 // additionally can do r + r + i with addr64. 32-bit has more addressing
1974 // mode options. Depending on the resource constant, it can also do
1975 // (i64 r0) + (i32 r1) * (i14 i).
1976 //
1977 // Private arrays end up using a scratch buffer most of the time, so also
1978 // assume those use MUBUF instructions. Scratch loads / stores are currently
1979 // implemented as mubuf instructions with offen bit set, so slightly
1980 // different than the normal addr64.
1981 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1982 if (!TII->isLegalMUBUFImmOffset(Imm: AM.BaseOffs))
1983 return false;
1984
1985 // FIXME: Since we can split immediate into soffset and immediate offset,
1986 // would it make sense to allow any immediate?
1987
1988 switch (AM.Scale) {
1989 case 0: // r + i or just i, depending on HasBaseReg.
1990 return true;
1991 case 1:
1992 return true; // We have r + r or r + i.
1993 case 2:
1994 if (AM.HasBaseReg) {
1995 // Reject 2 * r + r.
1996 return false;
1997 }
1998
1999 // Allow 2 * r as r + r
2000 // Or 2 * r + i is allowed as r + r + i.
2001 return true;
2002 default: // Don't allow n * r
2003 return false;
2004 }
2005}
2006
2007bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
2008 const AddrMode &AM, Type *Ty,
2009 unsigned AS,
2010 Instruction *I) const {
2011 // No global is ever allowed as a base.
2012 if (AM.BaseGV)
2013 return false;
2014
2015 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
2016 return isLegalGlobalAddressingMode(AM);
2017
2018 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
2019 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
2020 AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE ||
2021 AS == AMDGPUAS::BUFFER_STRIDED_POINTER) {
2022 // If the offset isn't a multiple of 4, it probably isn't going to be
2023 // correctly aligned.
2024 // FIXME: Can we get the real alignment here?
2025 if (AM.BaseOffs % 4 != 0)
2026 return isLegalMUBUFAddressingMode(AM);
2027
2028 if (!Subtarget->hasScalarSubwordLoads()) {
2029 // There are no SMRD extloads, so if we have to do a small type access we
2030 // will use a MUBUF load.
2031 // FIXME?: We also need to do this if unaligned, but we don't know the
2032 // alignment here.
2033 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
2034 return isLegalGlobalAddressingMode(AM);
2035 }
2036
2037 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
2038 // SMRD instructions have an 8-bit, dword offset on SI.
2039 if (!isUInt<8>(x: AM.BaseOffs / 4))
2040 return false;
2041 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
2042 // On CI+, this can also be a 32-bit literal constant offset. If it fits
2043 // in 8-bits, it can use a smaller encoding.
2044 if (!isUInt<32>(x: AM.BaseOffs / 4))
2045 return false;
2046 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
2047 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
2048 if (!isUInt<20>(x: AM.BaseOffs))
2049 return false;
2050 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
2051 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
2052 // for S_BUFFER_* instructions).
2053 if (!isInt<21>(x: AM.BaseOffs))
2054 return false;
2055 } else {
2056 // On GFX12, all offsets are signed 24-bit in bytes.
2057 if (!isInt<24>(x: AM.BaseOffs))
2058 return false;
2059 }
2060
2061 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
2062 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
2063 AM.BaseOffs < 0) {
2064 // Scalar (non-buffer) loads can only use a negative offset if
2065 // soffset+offset is non-negative. Since the compiler can only prove that
2066 // in a few special cases, it is safer to claim that negative offsets are
2067 // not supported.
2068 return false;
2069 }
2070
2071 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
2072 return true;
2073
2074 if (AM.Scale == 1 && AM.HasBaseReg)
2075 return true;
2076
2077 return false;
2078 }
2079
2080 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
2081 return Subtarget->hasFlatScratchEnabled()
2082 ? isLegalFlatAddressingMode(AM, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS)
2083 : isLegalMUBUFAddressingMode(AM);
2084
2085 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
2086 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
2087 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
2088 // field.
2089 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
2090 // an 8-bit dword offset but we don't know the alignment here.
2091 if (!isUInt<16>(x: AM.BaseOffs))
2092 return false;
2093
2094 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
2095 return true;
2096
2097 if (AM.Scale == 1 && AM.HasBaseReg)
2098 return true;
2099
2100 return false;
2101 }
2102
2103 if (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) {
2104 // For an unknown address space, this usually means that this is for some
2105 // reason being used for pure arithmetic, and not based on some addressing
2106 // computation. We don't have instructions that compute pointers with any
2107 // addressing modes, so treat them as having no offset like flat
2108 // instructions.
2109 return isLegalFlatAddressingMode(AM, AddrSpace: AMDGPUAS::FLAT_ADDRESS);
2110 }
2111
2112 // Assume a user alias of global for unknown address spaces.
2113 return isLegalGlobalAddressingMode(AM);
2114}
2115
2116bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
2117 const MachineFunction &MF) const {
2118 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
2119 return (MemVT.getSizeInBits() <= 4 * 32);
2120 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
2121 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
2122 return (MemVT.getSizeInBits() <= MaxPrivateBits);
2123 }
2124 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
2125 return (MemVT.getSizeInBits() <= 2 * 32);
2126 return true;
2127}
2128
2129bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
2130 unsigned Size, unsigned AddrSpace, Align Alignment,
2131 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
2132 if (IsFast)
2133 *IsFast = 0;
2134
2135 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
2136 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
2137 // Check if alignment requirements for ds_read/write instructions are
2138 // disabled.
2139 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
2140 return false;
2141
2142 Align RequiredAlignment(
2143 PowerOf2Ceil(A: divideCeil(Numerator: Size, Denominator: 8))); // Natural alignment.
2144 if (Subtarget->hasLDSMisalignedBugInWGPMode() && Size > 32 &&
2145 Alignment < RequiredAlignment)
2146 return false;
2147
2148 // Either, the alignment requirements are "enabled", or there is an
2149 // unaligned LDS access related hardware bug though alignment requirements
2150 // are "disabled". In either case, we need to check for proper alignment
2151 // requirements.
2152 //
2153 switch (Size) {
2154 case 64:
2155 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
2156 // address is negative, then the instruction is incorrectly treated as
2157 // out-of-bounds even if base + offsets is in bounds. Split vectorized
2158 // loads here to avoid emitting ds_read2_b32. We may re-combine the
2159 // load later in the SILoadStoreOptimizer.
2160 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
2161 return false;
2162
2163 // 8 byte accessing via ds_read/write_b64 require 8-byte alignment, but we
2164 // can do a 4 byte aligned, 8 byte access in a single operation using
2165 // ds_read2/write2_b32 with adjacent offsets.
2166 RequiredAlignment = Align(4);
2167
2168 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2169 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
2170 // ds_write2_b32 depending on the alignment. In either case with either
2171 // alignment there is no faster way of doing this.
2172
2173 // The numbers returned here and below are not additive, it is a 'speed
2174 // rank'. They are just meant to be compared to decide if a certain way
2175 // of lowering an operation is faster than another. For that purpose
2176 // naturally aligned operation gets it bitsize to indicate that "it
2177 // operates with a speed comparable to N-bit wide load". With the full
2178 // alignment ds128 is slower than ds96 for example. If underaligned it
2179 // is comparable to a speed of a single dword access, which would then
2180 // mean 32 < 128 and it is faster to issue a wide load regardless.
2181 // 1 is simply "slow, don't do it". I.e. comparing an aligned load to a
2182 // wider load which will not be aligned anymore the latter is slower.
2183 if (IsFast)
2184 *IsFast = (Alignment >= RequiredAlignment) ? 64
2185 : (Alignment < Align(4)) ? 32
2186 : 1;
2187 return true;
2188 }
2189
2190 break;
2191 case 96:
2192 if (!Subtarget->hasDS96AndDS128())
2193 return false;
2194
2195 // 12 byte accessing via ds_read/write_b96 require 16-byte alignment on
2196 // gfx8 and older.
2197
2198 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2199 // Naturally aligned access is fastest. However, also report it is Fast
2200 // if memory is aligned less than DWORD. A narrow load or store will be
2201 // be equally slow as a single ds_read_b96/ds_write_b96, but there will
2202 // be more of them, so overall we will pay less penalty issuing a single
2203 // instruction.
2204
2205 // See comment on the values above.
2206 if (IsFast)
2207 *IsFast = (Alignment >= RequiredAlignment) ? 96
2208 : (Alignment < Align(4)) ? 32
2209 : 1;
2210 return true;
2211 }
2212
2213 break;
2214 case 128:
2215 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
2216 return false;
2217
2218 // 16 byte accessing via ds_read/write_b128 require 16-byte alignment on
2219 // gfx8 and older, but we can do a 8 byte aligned, 16 byte access in a
2220 // single operation using ds_read2/write2_b64.
2221 RequiredAlignment = Align(8);
2222
2223 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2224 // Naturally aligned access is fastest. However, also report it is Fast
2225 // if memory is aligned less than DWORD. A narrow load or store will be
2226 // be equally slow as a single ds_read_b128/ds_write_b128, but there
2227 // will be more of them, so overall we will pay less penalty issuing a
2228 // single instruction.
2229
2230 // See comment on the values above.
2231 if (IsFast)
2232 *IsFast = (Alignment >= RequiredAlignment) ? 128
2233 : (Alignment < Align(4)) ? 32
2234 : 1;
2235 return true;
2236 }
2237
2238 break;
2239 default:
2240 if (Size > 32)
2241 return false;
2242
2243 break;
2244 }
2245
2246 // See comment on the values above.
2247 // Note that we have a single-dword or sub-dword here, so if underaligned
2248 // it is a slowest possible access, hence returned value is 0.
2249 if (IsFast)
2250 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
2251
2252 return Alignment >= RequiredAlignment ||
2253 Subtarget->hasUnalignedDSAccessEnabled();
2254 }
2255
2256 // FIXME: We have to be conservative here and assume that flat operations
2257 // will access scratch. If we had access to the IR function, then we
2258 // could determine if any private memory was used in the function.
2259 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
2260 AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
2261 bool AlignedBy4 = Alignment >= Align(4);
2262 if (Subtarget->hasUnalignedScratchAccessEnabled()) {
2263 if (IsFast)
2264 *IsFast = AlignedBy4 ? Size : 1;
2265 return true;
2266 }
2267
2268 if (IsFast)
2269 *IsFast = AlignedBy4;
2270
2271 return AlignedBy4;
2272 }
2273
2274 // So long as they are correct, wide global memory operations perform better
2275 // than multiple smaller memory ops -- even when misaligned
2276 if (AMDGPU::isExtendedGlobalAddrSpace(AS: AddrSpace)) {
2277 if (IsFast)
2278 *IsFast = Size;
2279
2280 return Alignment >= Align(4) ||
2281 Subtarget->hasUnalignedBufferAccessEnabled();
2282 }
2283
2284 // Ensure robust out-of-bounds guarantees for buffer accesses are met when the
2285 // "amdgpu.buffer.oob.mode" module flag has not enabled relaxed untyped-buffer
2286 // OOB semantics. Normally hardware will ensure proper
2287 // out-of-bounds behavior, but in the edge case where an access starts
2288 // out-of-bounds and then enters in-bounds, the entire access would be treated
2289 // as out-of-bounds. Prevent misaligned memory accesses by requiring the
2290 // natural alignment of buffer accesses.
2291 if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
2292 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
2293 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
2294 if (!Subtarget->hasRelaxedBufferOOBMode() &&
2295 Alignment < Align(PowerOf2Ceil(A: divideCeil(Numerator: Size, Denominator: 8))))
2296 return false;
2297 }
2298
2299 // Smaller than dword value must be aligned.
2300 if (Size < 32)
2301 return false;
2302
2303 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
2304 // byte-address are ignored, thus forcing Dword alignment.
2305 // This applies to private, global, and constant memory.
2306 if (IsFast)
2307 *IsFast = 1;
2308
2309 return Size >= 32 && Alignment >= Align(4);
2310}
2311
2312bool SITargetLowering::allowsMisalignedMemoryAccesses(
2313 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2314 unsigned *IsFast) const {
2315 return allowsMisalignedMemoryAccessesImpl(Size: VT.getSizeInBits(), AddrSpace,
2316 Alignment, Flags, IsFast);
2317}
2318
2319EVT SITargetLowering::getOptimalMemOpType(
2320 LLVMContext &Context, const MemOp &Op,
2321 const AttributeList &FuncAttributes) const {
2322 // FIXME: Should account for address space here.
2323
2324 // The default fallback uses the private pointer size as a guess for a type to
2325 // use. Make sure we switch these to 64-bit accesses.
2326
2327 if (Op.size() >= 16 &&
2328 Op.isDstAligned(AlignCheck: Align(4))) // XXX: Should only do for global
2329 return MVT::v4i32;
2330
2331 if (Op.size() >= 8 && Op.isDstAligned(AlignCheck: Align(4)))
2332 return MVT::v2i32;
2333
2334 // Use the default.
2335 return MVT::Other;
2336}
2337
2338bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
2339 const MemSDNode *MemNode = cast<MemSDNode>(Val: N);
2340 return MemNode->getMemOperand()->getFlags() & MONoClobber;
2341}
2342
2343bool SITargetLowering::isNonGlobalAddrSpace(unsigned AS) {
2344 return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
2345 AS == AMDGPUAS::PRIVATE_ADDRESS;
2346}
2347
2348bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
2349 unsigned DestAS) const {
2350 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
2351 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2352 Subtarget->hasGloballyAddressableScratch()) {
2353 // Flat -> private requires subtracting src_flat_scratch_base_lo.
2354 return false;
2355 }
2356
2357 // Flat -> private/local is a simple truncate.
2358 // Flat -> global is no-op
2359 return true;
2360 }
2361
2362 const GCNTargetMachine &TM =
2363 static_cast<const GCNTargetMachine &>(getTargetMachine());
2364 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
2365}
2366
2367TargetLoweringBase::LegalizeTypeAction
2368SITargetLowering::getPreferredVectorAction(MVT VT) const {
2369 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2370 VT.getScalarType().bitsLE(VT: MVT::i16))
2371 return VT.isPow2VectorType() ? TypeSplitVector : TypeWidenVector;
2372 return TargetLoweringBase::getPreferredVectorAction(VT);
2373}
2374
2375bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
2376 Type *Ty) const {
2377 // FIXME: Could be smarter if called for vector constants.
2378 return true;
2379}
2380
2381bool SITargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
2382 unsigned Index) const {
2383 if (!isOperationLegalOrCustom(Op: ISD::EXTRACT_SUBVECTOR, VT: ResVT))
2384 return false;
2385
2386 // TODO: Add more cases that are cheap.
2387 return Index == 0;
2388}
2389
2390bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
2391 // TODO: This should be more aggressive, particular for 16-bit element
2392 // vectors. However there are some mixed improvements and regressions.
2393 EVT EltTy = VT.getVectorElementType();
2394 unsigned MinAlign = Subtarget->useRealTrue16Insts() ? 16 : 32;
2395 return EltTy.getSizeInBits() % MinAlign == 0;
2396}
2397
2398bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
2399 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2400 switch (Op) {
2401 case ISD::LOAD:
2402 case ISD::STORE:
2403 return true;
2404 default:
2405 return false;
2406 }
2407 }
2408
2409 // SimplifySetCC uses this function to determine whether or not it should
2410 // create setcc with i1 operands. We don't have instructions for i1 setcc.
2411 if (VT == MVT::i1 && Op == ISD::SETCC)
2412 return false;
2413
2414 return TargetLowering::isTypeDesirableForOp(Op, VT);
2415}
2416
2417MachinePointerInfo
2418SITargetLowering::getKernargSegmentPtrInfo(MachineFunction &MF) const {
2419 // This isn't really a constant pool but close enough.
2420 MachinePointerInfo PtrInfo(MF.getPSVManager().getConstantPool());
2421 PtrInfo.AddrSpace = AMDGPUAS::CONSTANT_ADDRESS;
2422 return PtrInfo;
2423}
2424
2425SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
2426 const SDLoc &SL,
2427 SDValue Chain,
2428 uint64_t Offset) const {
2429 const DataLayout &DL = DAG.getDataLayout();
2430 MachineFunction &MF = DAG.getMachineFunction();
2431 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2432 MVT PtrVT = getPointerTy(DL, AS: AMDGPUAS::CONSTANT_ADDRESS);
2433
2434 auto [InputPtrReg, RC, ArgTy] =
2435 Info->getPreloadedValue(Value: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2436
2437 // We may not have the kernarg segment argument if we have no kernel
2438 // arguments.
2439 if (!InputPtrReg)
2440 return DAG.getConstant(Val: Offset, DL: SL, VT: PtrVT);
2441
2442 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
2443 SDValue BasePtr = DAG.getCopyFromReg(
2444 Chain, dl: SL, Reg: MRI.getLiveInVirtReg(PReg: InputPtrReg->getRegister()), VT: PtrVT);
2445
2446 return DAG.getObjectPtrOffset(SL, Ptr: BasePtr, Offset: TypeSize::getFixed(ExactSize: Offset));
2447}
2448
2449SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
2450 const SDLoc &SL) const {
2451 uint64_t Offset =
2452 getImplicitParameterOffset(MF: DAG.getMachineFunction(), Param: FIRST_IMPLICIT);
2453 return lowerKernArgParameterPtr(DAG, SL, Chain: DAG.getEntryNode(), Offset);
2454}
2455
2456SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2457 const SDLoc &SL) const {
2458
2459 Function &F = DAG.getMachineFunction().getFunction();
2460 std::optional<uint32_t> KnownSize =
2461 AMDGPUMachineFunctionInfo::getLDSKernelIdMetadata(F);
2462 if (KnownSize.has_value())
2463 return DAG.getConstant(Val: *KnownSize, DL: SL, VT: MVT::i32);
2464 return SDValue();
2465}
2466
2467SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2468 const SDLoc &SL, SDValue Val,
2469 bool Signed,
2470 const ISD::InputArg *Arg) const {
2471 // First, if it is a widened vector, narrow it.
2472 if (VT.isVector() &&
2473 VT.getVectorNumElements() != MemVT.getVectorNumElements()) {
2474 EVT NarrowedVT =
2475 EVT::getVectorVT(Context&: *DAG.getContext(), VT: MemVT.getVectorElementType(),
2476 NumElements: VT.getVectorNumElements());
2477 Val = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: NarrowedVT, N1: Val,
2478 N2: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32));
2479 }
2480
2481 // Then convert the vector elements or scalar value.
2482 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(VT: MemVT)) {
2483 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2484 Val = DAG.getNode(Opcode: Opc, DL: SL, VT: MemVT, N1: Val, N2: DAG.getValueType(VT));
2485 }
2486
2487 if (MemVT.isFloatingPoint()) {
2488 if (VT.isFloatingPoint()) {
2489 Val = getFPExtOrFPRound(DAG, Op: Val, DL: SL, VT);
2490 } else {
2491 assert(!MemVT.isVector());
2492 EVT IntVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: MemVT.getSizeInBits());
2493 SDValue Cast = DAG.getBitcast(VT: IntVT, V: Val);
2494 Val = DAG.getAnyExtOrTrunc(Op: Cast, DL: SL, VT);
2495 }
2496 } else if (Signed)
2497 Val = DAG.getSExtOrTrunc(Op: Val, DL: SL, VT);
2498 else
2499 Val = DAG.getZExtOrTrunc(Op: Val, DL: SL, VT);
2500
2501 return Val;
2502}
2503
2504SDValue SITargetLowering::lowerKernargMemParameter(
2505 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2506 uint64_t Offset, Align Alignment, bool Signed,
2507 const ISD::InputArg *Arg) const {
2508
2509 MachinePointerInfo PtrInfo =
2510 getKernargSegmentPtrInfo(MF&: DAG.getMachineFunction());
2511
2512 // Try to avoid using an extload by loading earlier than the argument address,
2513 // and extracting the relevant bits. The load should hopefully be merged with
2514 // the previous argument.
2515 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2516 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2517 int64_t AlignDownOffset = alignDown(Value: Offset, Align: 4);
2518 int64_t OffsetDiff = Offset - AlignDownOffset;
2519
2520 EVT IntVT = MemVT.changeTypeToInteger();
2521
2522 // TODO: If we passed in the base kernel offset we could have a better
2523 // alignment than 4, but we don't really need it.
2524 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset: AlignDownOffset);
2525 SDValue Load = DAG.getLoad(VT: MVT::i32, dl: SL, Chain, Ptr,
2526 PtrInfo: PtrInfo.getWithOffset(O: AlignDownOffset), Alignment: Align(4),
2527 MMOFlags: MachineMemOperand::MODereferenceable |
2528 MachineMemOperand::MOInvariant);
2529
2530 SDValue ShiftAmt = DAG.getConstant(Val: OffsetDiff * 8, DL: SL, VT: MVT::i32);
2531 SDValue Extract = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: MVT::i32, N1: Load, N2: ShiftAmt);
2532
2533 SDValue ArgVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: IntVT, Operand: Extract);
2534 ArgVal = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MemVT, Operand: ArgVal);
2535 ArgVal = convertArgType(DAG, VT, MemVT, SL, Val: ArgVal, Signed, Arg);
2536
2537 return DAG.getMergeValues(Ops: {ArgVal, Load.getValue(R: 1)}, dl: SL);
2538 }
2539
2540 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2541 SDValue Load = DAG.getLoad(
2542 VT: MemVT, dl: SL, Chain, Ptr, PtrInfo: PtrInfo.getWithOffset(O: Offset), Alignment,
2543 MMOFlags: MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant);
2544
2545 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Val: Load, Signed, Arg);
2546 return DAG.getMergeValues(Ops: {Val, Load.getValue(R: 1)}, dl: SL);
2547}
2548
2549/// Coerce an argument which was passed in a different ABI type to the original
2550/// expected value type.
2551SDValue SITargetLowering::convertABITypeToValueType(SelectionDAG &DAG,
2552 SDValue Val,
2553 CCValAssign &VA,
2554 const SDLoc &SL) const {
2555 EVT ValVT = VA.getValVT();
2556
2557 // If this is an 8 or 16-bit value, it is really passed promoted
2558 // to 32 bits. Insert an assert[sz]ext to capture this, then
2559 // truncate to the right size.
2560 switch (VA.getLocInfo()) {
2561 case CCValAssign::Full:
2562 return Val;
2563 case CCValAssign::BCvt:
2564 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: ValVT, Operand: Val);
2565 case CCValAssign::SExt:
2566 Val = DAG.getNode(Opcode: ISD::AssertSext, DL: SL, VT: VA.getLocVT(), N1: Val,
2567 N2: DAG.getValueType(ValVT));
2568 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: ValVT, Operand: Val);
2569 case CCValAssign::ZExt:
2570 Val = DAG.getNode(Opcode: ISD::AssertZext, DL: SL, VT: VA.getLocVT(), N1: Val,
2571 N2: DAG.getValueType(ValVT));
2572 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: ValVT, Operand: Val);
2573 case CCValAssign::AExt:
2574 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: ValVT, Operand: Val);
2575 default:
2576 llvm_unreachable("Unknown loc info!");
2577 }
2578}
2579
2580SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2581 CCValAssign &VA, const SDLoc &SL,
2582 SDValue Chain,
2583 const ISD::InputArg &Arg) const {
2584 MachineFunction &MF = DAG.getMachineFunction();
2585 MachineFrameInfo &MFI = MF.getFrameInfo();
2586
2587 if (Arg.Flags.isByVal()) {
2588 unsigned Size = Arg.Flags.getByValSize();
2589 int FrameIdx = MFI.CreateFixedObject(Size, SPOffset: VA.getLocMemOffset(), IsImmutable: false);
2590 return DAG.getFrameIndex(FI: FrameIdx, VT: MVT::i32);
2591 }
2592
2593 unsigned ArgOffset = VA.getLocMemOffset();
2594 unsigned ArgSize = VA.getValVT().getStoreSize();
2595
2596 int FI = MFI.CreateFixedObject(Size: ArgSize, SPOffset: ArgOffset, IsImmutable: true);
2597
2598 // Create load nodes to retrieve arguments from the stack.
2599 SDValue FIN = DAG.getFrameIndex(FI, VT: MVT::i32);
2600
2601 // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
2602 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
2603 MVT MemVT = VA.getValVT();
2604
2605 switch (VA.getLocInfo()) {
2606 default:
2607 break;
2608 case CCValAssign::BCvt:
2609 MemVT = VA.getLocVT();
2610 break;
2611 case CCValAssign::SExt:
2612 ExtType = ISD::SEXTLOAD;
2613 break;
2614 case CCValAssign::ZExt:
2615 ExtType = ISD::ZEXTLOAD;
2616 break;
2617 case CCValAssign::AExt:
2618 ExtType = ISD::EXTLOAD;
2619 break;
2620 }
2621
2622 SDValue ArgValue = DAG.getExtLoad(
2623 ExtType, dl: SL, VT: VA.getLocVT(), Chain, Ptr: FIN,
2624 PtrInfo: MachinePointerInfo::getFixedStack(MF&: DAG.getMachineFunction(), FI), MemVT);
2625
2626 SDValue ConvertedVal = convertABITypeToValueType(DAG, Val: ArgValue, VA, SL);
2627 if (ConvertedVal == ArgValue)
2628 return ConvertedVal;
2629
2630 return DAG.getMergeValues(Ops: {ConvertedVal, ArgValue.getValue(R: 1)}, dl: SL);
2631}
2632
2633SDValue SITargetLowering::lowerWorkGroupId(
2634 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2635 AMDGPUFunctionArgInfo::PreloadedValue WorkGroupIdPV,
2636 AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV,
2637 AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
2638 if (!Subtarget->hasClusters())
2639 return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2640
2641 // Clusters are supported. Return the global position in the grid. If clusters
2642 // are enabled, WorkGroupIdPV returns the cluster ID not the workgroup ID.
2643
2644 // WorkGroupIdXYZ = ClusterId == 0 ?
2645 // ClusterIdXYZ :
2646 // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
2647 SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2648 SDLoc SL(ClusterIdXYZ);
2649 SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
2650 SDValue One = DAG.getConstant(Val: 1, DL: SL, VT);
2651 SDValue ClusterSizeXYZ = DAG.getNode(Opcode: ISD::ADD, DL: SL, VT, N1: ClusterMaxIdXYZ, N2: One);
2652 SDValue ClusterWorkGroupIdXYZ =
2653 getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
2654 SDValue GlobalIdXYZ =
2655 DAG.getNode(Opcode: ISD::ADD, DL: SL, VT, N1: ClusterWorkGroupIdXYZ,
2656 N2: DAG.getNode(Opcode: ISD::MUL, DL: SL, VT, N1: ClusterIdXYZ, N2: ClusterSizeXYZ));
2657
2658 switch (MFI.getClusterDims().getKind()) {
2659 case AMDGPU::ClusterDimsAttr::Kind::FixedDims:
2660 case AMDGPU::ClusterDimsAttr::Kind::VariableDims:
2661 return GlobalIdXYZ;
2662 case AMDGPU::ClusterDimsAttr::Kind::NoCluster:
2663 return ClusterIdXYZ;
2664 case AMDGPU::ClusterDimsAttr::Kind::Unknown: {
2665 using namespace AMDGPU::Hwreg;
2666 SDValue ClusterIdField =
2667 DAG.getTargetConstant(Val: HwregEncoding::encode(Values: ID_IB_STS2, Values: 6, Values: 4), DL: SL, VT);
2668 SDNode *GetReg =
2669 DAG.getMachineNode(Opcode: AMDGPU::S_GETREG_B32_const, dl: SL, VT, Op1: ClusterIdField);
2670 SDValue ClusterId(GetReg, 0);
2671 SDValue Zero = DAG.getConstant(Val: 0, DL: SL, VT);
2672 return DAG.getNode(Opcode: ISD::SELECT_CC, DL: SL, VT, N1: ClusterId, N2: Zero, N3: ClusterIdXYZ,
2673 N4: GlobalIdXYZ, N5: DAG.getCondCode(Cond: ISD::SETEQ));
2674 }
2675 }
2676
2677 llvm_unreachable("nothing should reach here");
2678}
2679
2680SDValue SITargetLowering::getPreloadedValue(
2681 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2682 AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
2683 const ArgDescriptor *Reg = nullptr;
2684 const TargetRegisterClass *RC;
2685 LLT Ty;
2686
2687 CallingConv::ID CC = DAG.getMachineFunction().getFunction().getCallingConv();
2688 const ArgDescriptor WorkGroupIDX =
2689 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP9);
2690 // If GridZ is not programmed in an entry function then the hardware will set
2691 // it to all zeros, so there is no need to mask the GridY value in the low
2692 // order bits.
2693 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2694 Reg: AMDGPU::TTMP7,
2695 Mask: AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2696 const ArgDescriptor WorkGroupIDZ =
2697 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP7, Mask: 0xFFFF0000u);
2698 const ArgDescriptor ClusterWorkGroupIDX =
2699 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x0000000Fu);
2700 const ArgDescriptor ClusterWorkGroupIDY =
2701 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x000000F0u);
2702 const ArgDescriptor ClusterWorkGroupIDZ =
2703 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x00000F00u);
2704 const ArgDescriptor ClusterWorkGroupMaxIDX =
2705 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x0000F000u);
2706 const ArgDescriptor ClusterWorkGroupMaxIDY =
2707 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x000F0000u);
2708 const ArgDescriptor ClusterWorkGroupMaxIDZ =
2709 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x00F00000u);
2710 const ArgDescriptor ClusterWorkGroupMaxFlatID =
2711 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x0F000000u);
2712
2713 auto LoadConstant = [&](unsigned N) {
2714 return DAG.getConstant(Val: N, DL: SDLoc(), VT);
2715 };
2716
2717 if (Subtarget->hasArchitectedSGPRs() &&
2718 (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
2719 AMDGPU::ClusterDimsAttr ClusterDims = MFI.getClusterDims();
2720 bool HasFixedDims = ClusterDims.isFixedDims();
2721
2722 switch (PVID) {
2723 case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
2724 Reg = &WorkGroupIDX;
2725 RC = &AMDGPU::SReg_32RegClass;
2726 Ty = LLT::scalar(SizeInBits: 32);
2727 break;
2728 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
2729 Reg = &WorkGroupIDY;
2730 RC = &AMDGPU::SReg_32RegClass;
2731 Ty = LLT::scalar(SizeInBits: 32);
2732 break;
2733 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
2734 Reg = &WorkGroupIDZ;
2735 RC = &AMDGPU::SReg_32RegClass;
2736 Ty = LLT::scalar(SizeInBits: 32);
2737 break;
2738 case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X:
2739 if (HasFixedDims && ClusterDims.getDims()[0] == 1)
2740 return LoadConstant(0);
2741 Reg = &ClusterWorkGroupIDX;
2742 RC = &AMDGPU::SReg_32RegClass;
2743 Ty = LLT::scalar(SizeInBits: 32);
2744 break;
2745 case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y:
2746 if (HasFixedDims && ClusterDims.getDims()[1] == 1)
2747 return LoadConstant(0);
2748 Reg = &ClusterWorkGroupIDY;
2749 RC = &AMDGPU::SReg_32RegClass;
2750 Ty = LLT::scalar(SizeInBits: 32);
2751 break;
2752 case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z:
2753 if (HasFixedDims && ClusterDims.getDims()[2] == 1)
2754 return LoadConstant(0);
2755 Reg = &ClusterWorkGroupIDZ;
2756 RC = &AMDGPU::SReg_32RegClass;
2757 Ty = LLT::scalar(SizeInBits: 32);
2758 break;
2759 case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X:
2760 if (HasFixedDims)
2761 return LoadConstant(ClusterDims.getDims()[0] - 1);
2762 Reg = &ClusterWorkGroupMaxIDX;
2763 RC = &AMDGPU::SReg_32RegClass;
2764 Ty = LLT::scalar(SizeInBits: 32);
2765 break;
2766 case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y:
2767 if (HasFixedDims)
2768 return LoadConstant(ClusterDims.getDims()[1] - 1);
2769 Reg = &ClusterWorkGroupMaxIDY;
2770 RC = &AMDGPU::SReg_32RegClass;
2771 Ty = LLT::scalar(SizeInBits: 32);
2772 break;
2773 case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z:
2774 if (HasFixedDims)
2775 return LoadConstant(ClusterDims.getDims()[2] - 1);
2776 Reg = &ClusterWorkGroupMaxIDZ;
2777 RC = &AMDGPU::SReg_32RegClass;
2778 Ty = LLT::scalar(SizeInBits: 32);
2779 break;
2780 case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID:
2781 Reg = &ClusterWorkGroupMaxFlatID;
2782 RC = &AMDGPU::SReg_32RegClass;
2783 Ty = LLT::scalar(SizeInBits: 32);
2784 break;
2785 default:
2786 break;
2787 }
2788 }
2789
2790 if (!Reg)
2791 std::tie(args&: Reg, args&: RC, args&: Ty) = MFI.getPreloadedValue(Value: PVID);
2792 if (!Reg) {
2793 if (PVID == AMDGPUFunctionArgInfo::PreloadedValue::KERNARG_SEGMENT_PTR) {
2794 // It's possible for a kernarg intrinsic call to appear in a kernel with
2795 // no allocated segment, in which case we do not add the user sgpr
2796 // argument, so just return null.
2797 return DAG.getConstant(Val: 0, DL: SDLoc(), VT);
2798 }
2799
2800 // It's undefined behavior if a function marked with the amdgpu-no-*
2801 // attributes uses the corresponding intrinsic.
2802 return DAG.getPOISON(VT);
2803 }
2804
2805 return loadInputValue(DAG, RC, VT, SL: SDLoc(DAG.getEntryNode()), Arg: *Reg);
2806}
2807
2808static void processPSInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
2809 CallingConv::ID CallConv,
2810 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2811 FunctionType *FType,
2812 SIMachineFunctionInfo *Info) {
2813 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2814 const ISD::InputArg *Arg = &Ins[I];
2815
2816 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2817 "vector type argument should have been split");
2818
2819 // First check if it's a PS input addr.
2820 if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2821 PSInputNum <= 15) {
2822 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(Index: PSInputNum);
2823
2824 // Inconveniently only the first part of the split is marked as isSplit,
2825 // so skip to the end. We only want to increment PSInputNum once for the
2826 // entire split argument.
2827 if (Arg->Flags.isSplit()) {
2828 while (!Arg->Flags.isSplitEnd()) {
2829 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2830 "unexpected vector split in ps argument type");
2831 if (!SkipArg)
2832 Splits.push_back(Elt: *Arg);
2833 Arg = &Ins[++I];
2834 }
2835 }
2836
2837 if (SkipArg) {
2838 // We can safely skip PS inputs.
2839 Skipped.set(Arg->getOrigArgIndex());
2840 ++PSInputNum;
2841 continue;
2842 }
2843
2844 Info->markPSInputAllocated(Index: PSInputNum);
2845 if (Arg->Used)
2846 Info->markPSInputEnabled(Index: PSInputNum);
2847
2848 ++PSInputNum;
2849 }
2850
2851 Splits.push_back(Elt: *Arg);
2852 }
2853}
2854
2855// Allocate special inputs passed in VGPRs.
2856void SITargetLowering::allocateSpecialEntryInputVGPRs(
2857 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2858 SIMachineFunctionInfo &Info) const {
2859 const LLT S32 = LLT::scalar(SizeInBits: 32);
2860 MachineRegisterInfo &MRI = MF.getRegInfo();
2861
2862 if (Info.hasWorkItemIDX()) {
2863 Register Reg = AMDGPU::VGPR0;
2864 MRI.setType(VReg: MF.addLiveIn(PReg: Reg, RC: &AMDGPU::VGPR_32RegClass), Ty: S32);
2865
2866 CCInfo.AllocateReg(Reg);
2867 unsigned Mask =
2868 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2869 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2870 }
2871
2872 if (Info.hasWorkItemIDY()) {
2873 assert(Info.hasWorkItemIDX());
2874 if (Subtarget->hasPackedTID()) {
2875 Info.setWorkItemIDY(
2876 ArgDescriptor::createRegister(Reg: AMDGPU::VGPR0, Mask: 0x3ff << 10));
2877 } else {
2878 unsigned Reg = AMDGPU::VGPR1;
2879 MRI.setType(VReg: MF.addLiveIn(PReg: Reg, RC: &AMDGPU::VGPR_32RegClass), Ty: S32);
2880
2881 CCInfo.AllocateReg(Reg);
2882 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2883 }
2884 }
2885
2886 if (Info.hasWorkItemIDZ()) {
2887 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2888 if (Subtarget->hasPackedTID()) {
2889 Info.setWorkItemIDZ(
2890 ArgDescriptor::createRegister(Reg: AMDGPU::VGPR0, Mask: 0x3ff << 20));
2891 } else {
2892 unsigned Reg = AMDGPU::VGPR2;
2893 MRI.setType(VReg: MF.addLiveIn(PReg: Reg, RC: &AMDGPU::VGPR_32RegClass), Ty: S32);
2894
2895 CCInfo.AllocateReg(Reg);
2896 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2897 }
2898 }
2899}
2900
2901// Try to allocate a VGPR at the end of the argument list, or if no argument
2902// VGPRs are left allocating a stack slot.
2903// If \p Mask is is given it indicates bitfield position in the register.
2904// If \p Arg is given use it with new ]p Mask instead of allocating new.
2905static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2906 ArgDescriptor Arg = ArgDescriptor()) {
2907 if (Arg.isSet())
2908 return ArgDescriptor::createArg(Arg, Mask);
2909
2910 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2911 unsigned RegIdx = CCInfo.getFirstUnallocated(Regs: ArgVGPRs);
2912 if (RegIdx == ArgVGPRs.size()) {
2913 // Spill to stack required.
2914 int64_t Offset = CCInfo.AllocateStack(Size: 4, Alignment: Align(4));
2915
2916 return ArgDescriptor::createStack(Offset, Mask);
2917 }
2918
2919 unsigned Reg = ArgVGPRs[RegIdx];
2920 Reg = CCInfo.AllocateReg(Reg);
2921 assert(Reg != AMDGPU::NoRegister);
2922
2923 MachineFunction &MF = CCInfo.getMachineFunction();
2924 Register LiveInVReg = MF.addLiveIn(PReg: Reg, RC: &AMDGPU::VGPR_32RegClass);
2925 MF.getRegInfo().setType(VReg: LiveInVReg, Ty: LLT::scalar(SizeInBits: 32));
2926 return ArgDescriptor::createRegister(Reg, Mask);
2927}
2928
2929static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
2930 const TargetRegisterClass *RC,
2931 unsigned NumArgRegs) {
2932 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2933 unsigned RegIdx = CCInfo.getFirstUnallocated(Regs: ArgSGPRs);
2934 if (RegIdx == ArgSGPRs.size())
2935 report_fatal_error(reason: "ran out of SGPRs for arguments");
2936
2937 unsigned Reg = ArgSGPRs[RegIdx];
2938 Reg = CCInfo.AllocateReg(Reg);
2939 assert(Reg != AMDGPU::NoRegister);
2940
2941 MachineFunction &MF = CCInfo.getMachineFunction();
2942 MF.addLiveIn(PReg: Reg, RC);
2943 return ArgDescriptor::createRegister(Reg);
2944}
2945
2946// If this has a fixed position, we still should allocate the register in the
2947// CCInfo state. Technically we could get away with this for values passed
2948// outside of the normal argument range.
2949static void allocateFixedSGPRInputImpl(CCState &CCInfo,
2950 const TargetRegisterClass *RC,
2951 MCRegister Reg) {
2952 Reg = CCInfo.AllocateReg(Reg);
2953 assert(Reg != AMDGPU::NoRegister);
2954 MachineFunction &MF = CCInfo.getMachineFunction();
2955 MF.addLiveIn(PReg: Reg, RC);
2956}
2957
2958static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2959 if (Arg) {
2960 allocateFixedSGPRInputImpl(CCInfo, RC: &AMDGPU::SGPR_32RegClass,
2961 Reg: Arg.getRegister());
2962 } else
2963 Arg = allocateSGPR32InputImpl(CCInfo, RC: &AMDGPU::SGPR_32RegClass, NumArgRegs: 32);
2964}
2965
2966static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2967 if (Arg) {
2968 allocateFixedSGPRInputImpl(CCInfo, RC: &AMDGPU::SGPR_64RegClass,
2969 Reg: Arg.getRegister());
2970 } else
2971 Arg = allocateSGPR32InputImpl(CCInfo, RC: &AMDGPU::SGPR_64RegClass, NumArgRegs: 16);
2972}
2973
2974/// Allocate implicit function VGPR arguments at the end of allocated user
2975/// arguments.
2976void SITargetLowering::allocateSpecialInputVGPRs(
2977 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2978 SIMachineFunctionInfo &Info) const {
2979 const unsigned Mask = 0x3ff;
2980 ArgDescriptor Arg;
2981
2982 if (Info.hasWorkItemIDX()) {
2983 Arg = allocateVGPR32Input(CCInfo, Mask);
2984 Info.setWorkItemIDX(Arg);
2985 }
2986
2987 if (Info.hasWorkItemIDY()) {
2988 Arg = allocateVGPR32Input(CCInfo, Mask: Mask << 10, Arg);
2989 Info.setWorkItemIDY(Arg);
2990 }
2991
2992 if (Info.hasWorkItemIDZ())
2993 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask: Mask << 20, Arg));
2994}
2995
2996/// Allocate implicit function VGPR arguments in fixed registers.
2997void SITargetLowering::allocateSpecialInputVGPRsFixed(
2998 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2999 SIMachineFunctionInfo &Info) const {
3000 Register Reg = CCInfo.AllocateReg(Reg: AMDGPU::VGPR31);
3001 if (!Reg)
3002 report_fatal_error(reason: "failed to allocate VGPR for implicit arguments");
3003
3004 const unsigned Mask = 0x3ff;
3005 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
3006 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask: Mask << 10));
3007 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask: Mask << 20));
3008}
3009
3010void SITargetLowering::allocateSpecialInputSGPRs(
3011 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
3012 SIMachineFunctionInfo &Info) const {
3013 auto &ArgInfo = Info.getArgInfo();
3014 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
3015
3016 // TODO: Unify handling with private memory pointers.
3017 if (UserSGPRInfo.hasDispatchPtr())
3018 allocateSGPR64Input(CCInfo, Arg&: ArgInfo.DispatchPtr);
3019
3020 if (UserSGPRInfo.hasQueuePtr())
3021 allocateSGPR64Input(CCInfo, Arg&: ArgInfo.QueuePtr);
3022
3023 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
3024 // constant offset from the kernarg segment.
3025 if (Info.hasImplicitArgPtr())
3026 allocateSGPR64Input(CCInfo, Arg&: ArgInfo.ImplicitArgPtr);
3027
3028 if (UserSGPRInfo.hasDispatchID())
3029 allocateSGPR64Input(CCInfo, Arg&: ArgInfo.DispatchID);
3030
3031 // flat_scratch_init is not applicable for non-kernel functions.
3032
3033 if (Info.hasWorkGroupIDX())
3034 allocateSGPR32Input(CCInfo, Arg&: ArgInfo.WorkGroupIDX);
3035
3036 if (Info.hasWorkGroupIDY())
3037 allocateSGPR32Input(CCInfo, Arg&: ArgInfo.WorkGroupIDY);
3038
3039 if (Info.hasWorkGroupIDZ())
3040 allocateSGPR32Input(CCInfo, Arg&: ArgInfo.WorkGroupIDZ);
3041
3042 if (Info.hasLDSKernelId())
3043 allocateSGPR32Input(CCInfo, Arg&: ArgInfo.LDSKernelId);
3044}
3045
3046// Allocate special inputs passed in user SGPRs.
3047void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
3048 MachineFunction &MF,
3049 const SIRegisterInfo &TRI,
3050 SIMachineFunctionInfo &Info) const {
3051 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
3052 if (UserSGPRInfo.hasImplicitBufferPtr()) {
3053 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
3054 MF.addLiveIn(PReg: ImplicitBufferPtrReg, RC: &AMDGPU::SGPR_64RegClass);
3055 CCInfo.AllocateReg(Reg: ImplicitBufferPtrReg);
3056 }
3057
3058 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
3059 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
3060 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
3061 MF.addLiveIn(PReg: PrivateSegmentBufferReg, RC: &AMDGPU::SGPR_128RegClass);
3062 CCInfo.AllocateReg(Reg: PrivateSegmentBufferReg);
3063 }
3064
3065 if (UserSGPRInfo.hasDispatchPtr()) {
3066 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
3067 MF.addLiveIn(PReg: DispatchPtrReg, RC: &AMDGPU::SGPR_64RegClass);
3068 CCInfo.AllocateReg(Reg: DispatchPtrReg);
3069 }
3070
3071 if (UserSGPRInfo.hasQueuePtr()) {
3072 Register QueuePtrReg = Info.addQueuePtr(TRI);
3073 MF.addLiveIn(PReg: QueuePtrReg, RC: &AMDGPU::SGPR_64RegClass);
3074 CCInfo.AllocateReg(Reg: QueuePtrReg);
3075 }
3076
3077 if (UserSGPRInfo.hasKernargSegmentPtr()) {
3078 MachineRegisterInfo &MRI = MF.getRegInfo();
3079 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
3080 CCInfo.AllocateReg(Reg: InputPtrReg);
3081
3082 Register VReg = MF.addLiveIn(PReg: InputPtrReg, RC: &AMDGPU::SGPR_64RegClass);
3083 MRI.setType(VReg, Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
3084 }
3085
3086 if (UserSGPRInfo.hasDispatchID()) {
3087 Register DispatchIDReg = Info.addDispatchID(TRI);
3088 MF.addLiveIn(PReg: DispatchIDReg, RC: &AMDGPU::SGPR_64RegClass);
3089 CCInfo.AllocateReg(Reg: DispatchIDReg);
3090 }
3091
3092 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
3093 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
3094 MF.addLiveIn(PReg: FlatScratchInitReg, RC: &AMDGPU::SGPR_64RegClass);
3095 CCInfo.AllocateReg(Reg: FlatScratchInitReg);
3096 }
3097
3098 if (UserSGPRInfo.hasPrivateSegmentSize()) {
3099 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
3100 MF.addLiveIn(PReg: PrivateSegmentSizeReg, RC: &AMDGPU::SGPR_32RegClass);
3101 CCInfo.AllocateReg(Reg: PrivateSegmentSizeReg);
3102 }
3103
3104 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
3105 // these from the dispatch pointer.
3106}
3107
3108// Allocate pre-loaded kernel arguemtns. Arguments to be preloading must be
3109// sequential starting from the first argument.
3110void SITargetLowering::allocatePreloadKernArgSGPRs(
3111 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
3112 const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF,
3113 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
3114 Function &F = MF.getFunction();
3115 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
3116 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
3117 bool InPreloadSequence = true;
3118 unsigned InIdx = 0;
3119 bool AlignedForImplictArgs = false;
3120 unsigned ImplicitArgOffset = 0;
3121 for (auto &Arg : F.args()) {
3122 if (!InPreloadSequence || !Arg.hasInRegAttr())
3123 break;
3124
3125 unsigned ArgIdx = Arg.getArgNo();
3126 // Don't preload non-original args or parts not in the current preload
3127 // sequence.
3128 if (InIdx < Ins.size() &&
3129 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
3130 break;
3131
3132 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
3133 Ins[InIdx].getOrigArgIndex() == ArgIdx;
3134 InIdx++) {
3135 assert(ArgLocs[ArgIdx].isMemLoc());
3136 auto &ArgLoc = ArgLocs[InIdx];
3137 const Align KernelArgBaseAlign = Align(16);
3138 unsigned ArgOffset = ArgLoc.getLocMemOffset();
3139 Align Alignment = commonAlignment(A: KernelArgBaseAlign, Offset: ArgOffset);
3140 unsigned NumAllocSGPRs =
3141 alignTo(Value: ArgLoc.getLocVT().getFixedSizeInBits(), Align: 32) / 32;
3142
3143 // Fix alignment for hidden arguments.
3144 if (Arg.hasAttribute(Kind: "amdgpu-hidden-argument")) {
3145 if (!AlignedForImplictArgs) {
3146 ImplicitArgOffset =
3147 alignTo(Size: LastExplicitArgOffset,
3148 A: Subtarget->getAlignmentForImplicitArgPtr()) -
3149 LastExplicitArgOffset;
3150 AlignedForImplictArgs = true;
3151 }
3152 ArgOffset += ImplicitArgOffset;
3153 }
3154
3155 // Arg is preloaded into the previous SGPR.
3156 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
3157 assert(InIdx >= 1 && "No previous SGPR");
3158 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
3159 Elt: Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
3160 continue;
3161 }
3162
3163 unsigned Padding = ArgOffset - LastExplicitArgOffset;
3164 unsigned PaddingSGPRs = alignTo(Value: Padding, Align: 4) / 4;
3165 // Check for free user SGPRs for preloading.
3166 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
3167 InPreloadSequence = false;
3168 break;
3169 }
3170
3171 // Preload this argument.
3172 const TargetRegisterClass *RC =
3173 TRI.getSGPRClassForBitWidth(BitWidth: NumAllocSGPRs * 32);
3174 SmallVectorImpl<MCRegister> *PreloadRegs =
3175 Info.addPreloadedKernArg(TRI, RC, AllocSizeDWord: NumAllocSGPRs, KernArgIdx: InIdx, PaddingSGPRs);
3176
3177 if (PreloadRegs->size() > 1)
3178 RC = &AMDGPU::SGPR_32RegClass;
3179 for (auto &Reg : *PreloadRegs) {
3180 assert(Reg);
3181 MF.addLiveIn(PReg: Reg, RC);
3182 CCInfo.AllocateReg(Reg);
3183 }
3184
3185 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
3186 }
3187 }
3188}
3189
3190void SITargetLowering::allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF,
3191 const SIRegisterInfo &TRI,
3192 SIMachineFunctionInfo &Info) const {
3193 // Always allocate this last since it is a synthetic preload.
3194 if (Info.hasLDSKernelId()) {
3195 Register Reg = Info.addLDSKernelId();
3196 MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
3197 CCInfo.AllocateReg(Reg);
3198 }
3199}
3200
3201// Allocate special input registers that are initialized per-wave.
3202void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF,
3203 SIMachineFunctionInfo &Info,
3204 CallingConv::ID CallConv,
3205 bool IsShader) const {
3206 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
3207 if (Subtarget->hasUserSGPRInit16BugInWave32() && !IsShader) {
3208 // Note: user SGPRs are handled by the front-end for graphics shaders
3209 // Pad up the used user SGPRs with dead inputs.
3210
3211 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
3212 // before enabling architected SGPRs for workgroup IDs.
3213 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
3214
3215 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
3216 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
3217 // rely on it to reach 16 since if we end up having no stack usage, it will
3218 // not really be added.
3219 unsigned NumRequiredSystemSGPRs =
3220 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
3221 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
3222 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
3223 Register Reg = Info.addReservedUserSGPR();
3224 MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
3225 CCInfo.AllocateReg(Reg);
3226 }
3227 }
3228
3229 if (!HasArchitectedSGPRs) {
3230 if (Info.hasWorkGroupIDX()) {
3231 Register Reg = Info.addWorkGroupIDX();
3232 MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
3233 CCInfo.AllocateReg(Reg);
3234 }
3235
3236 if (Info.hasWorkGroupIDY()) {
3237 Register Reg = Info.addWorkGroupIDY();
3238 MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
3239 CCInfo.AllocateReg(Reg);
3240 }
3241
3242 if (Info.hasWorkGroupIDZ()) {
3243 Register Reg = Info.addWorkGroupIDZ();
3244 MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
3245 CCInfo.AllocateReg(Reg);
3246 }
3247 }
3248
3249 if (Info.hasWorkGroupInfo()) {
3250 Register Reg = Info.addWorkGroupInfo();
3251 MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
3252 CCInfo.AllocateReg(Reg);
3253 }
3254
3255 if (Info.hasPrivateSegmentWaveByteOffset()) {
3256 // Scratch wave offset passed in system SGPR.
3257 unsigned PrivateSegmentWaveByteOffsetReg;
3258
3259 if (IsShader) {
3260 PrivateSegmentWaveByteOffsetReg =
3261 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
3262
3263 // This is true if the scratch wave byte offset doesn't have a fixed
3264 // location.
3265 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
3266 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
3267 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
3268 }
3269 } else
3270 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
3271
3272 MF.addLiveIn(PReg: PrivateSegmentWaveByteOffsetReg, RC: &AMDGPU::SGPR_32RegClass);
3273 CCInfo.AllocateReg(Reg: PrivateSegmentWaveByteOffsetReg);
3274 }
3275
3276 assert(!Subtarget->hasUserSGPRInit16BugInWave32() || IsShader ||
3277 Info.getNumPreloadedSGPRs() >= 16);
3278}
3279
3280static void reservePrivateMemoryRegs(const TargetMachine &TM,
3281 MachineFunction &MF,
3282 const SIRegisterInfo &TRI,
3283 SIMachineFunctionInfo &Info) {
3284 // Now that we've figured out where the scratch register inputs are, see if
3285 // should reserve the arguments and use them directly.
3286 MachineFrameInfo &MFI = MF.getFrameInfo();
3287 bool HasStackObjects = MFI.hasStackObjects();
3288 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
3289
3290 // Record that we know we have non-spill stack objects so we don't need to
3291 // check all stack objects later.
3292 if (HasStackObjects)
3293 Info.setHasNonSpillStackObjects(true);
3294
3295 // Everything live out of a block is spilled with fast regalloc, so it's
3296 // almost certain that spilling will be required.
3297 if (TM.getOptLevel() == CodeGenOptLevel::None)
3298 HasStackObjects = true;
3299
3300 // For now assume stack access is needed in any callee functions, so we need
3301 // the scratch registers to pass in.
3302 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
3303
3304 if (!ST.hasFlatScratchEnabled()) {
3305 if (RequiresStackAccess && ST.isAmdHsaOrMesa(F: MF.getFunction())) {
3306 // If we have stack objects, we unquestionably need the private buffer
3307 // resource. For the Code Object V2 ABI, this will be the first 4 user
3308 // SGPR inputs. We can reserve those and use them directly.
3309
3310 Register PrivateSegmentBufferReg =
3311 Info.getPreloadedReg(Value: AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
3312 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
3313 } else {
3314 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
3315 // We tentatively reserve the last registers (skipping the last registers
3316 // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
3317 // we'll replace these with the ones immediately after those which were
3318 // really allocated. In the prologue copies will be inserted from the
3319 // argument to these reserved registers.
3320
3321 // Without HSA, relocations are used for the scratch pointer and the
3322 // buffer resource setup is always inserted in the prologue. Scratch wave
3323 // offset is still in an input SGPR.
3324 Info.setScratchRSrcReg(ReservedBufferReg);
3325 }
3326 }
3327
3328 MachineRegisterInfo &MRI = MF.getRegInfo();
3329
3330 // For entry functions we have to set up the stack pointer if we use it,
3331 // whereas non-entry functions get this "for free". This means there is no
3332 // intrinsic advantage to using S32 over S34 in cases where we do not have
3333 // calls but do need a frame pointer (i.e. if we are requested to have one
3334 // because frame pointer elimination is disabled). To keep things simple we
3335 // only ever use S32 as the call ABI stack pointer, and so using it does not
3336 // imply we need a separate frame pointer.
3337 //
3338 // Try to use s32 as the SP, but move it if it would interfere with input
3339 // arguments. This won't work with calls though.
3340 //
3341 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
3342 // registers.
3343 if (!MRI.isLiveIn(Reg: AMDGPU::SGPR32)) {
3344 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
3345 } else {
3346 assert(AMDGPU::isShader(MF.getFunction().getCallingConv()));
3347
3348 if (MFI.hasCalls())
3349 report_fatal_error(reason: "call in graphics shader with too many input SGPRs");
3350
3351 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
3352 if (!MRI.isLiveIn(Reg)) {
3353 Info.setStackPtrOffsetReg(Reg);
3354 break;
3355 }
3356 }
3357
3358 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
3359 report_fatal_error(reason: "failed to find register for SP");
3360 }
3361
3362 // hasFP should be accurate for entry functions even before the frame is
3363 // finalized, because it does not rely on the known stack size, only
3364 // properties like whether variable sized objects are present.
3365 if (ST.getFrameLowering()->hasFP(MF)) {
3366 Info.setFrameOffsetReg(AMDGPU::SGPR33);
3367 }
3368}
3369
3370bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
3371 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3372 return !Info->isEntryFunction();
3373}
3374
3375void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {}
3376
3377void SITargetLowering::insertCopiesSplitCSR(
3378 MachineBasicBlock *Entry,
3379 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
3380 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
3381
3382 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(MF: Entry->getParent());
3383 if (!IStart)
3384 return;
3385
3386 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3387 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
3388 MachineBasicBlock::iterator MBBI = Entry->begin();
3389 for (const MCPhysReg *I = IStart; *I; ++I) {
3390 const TargetRegisterClass *RC = nullptr;
3391 if (AMDGPU::SReg_64RegClass.contains(Reg: *I))
3392 RC = &AMDGPU::SGPR_64RegClass;
3393 else if (AMDGPU::SReg_32RegClass.contains(Reg: *I))
3394 RC = &AMDGPU::SGPR_32RegClass;
3395 else
3396 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3397
3398 Register NewVR = MRI->createVirtualRegister(RegClass: RC);
3399 // Create copy from CSR to a virtual register.
3400 Entry->addLiveIn(PhysReg: *I);
3401 BuildMI(BB&: *Entry, I: MBBI, MIMD: DebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: NewVR)
3402 .addReg(RegNo: *I);
3403
3404 // Insert the copy-back instructions right before the terminator.
3405 for (auto *Exit : Exits)
3406 BuildMI(BB&: *Exit, I: Exit->getFirstTerminator(), MIMD: DebugLoc(),
3407 MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: *I)
3408 .addReg(RegNo: NewVR);
3409 }
3410}
3411
3412SDValue SITargetLowering::LowerFormalArguments(
3413 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3414 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3415 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3416 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
3417
3418 MachineFunction &MF = DAG.getMachineFunction();
3419 const Function &Fn = MF.getFunction();
3420 FunctionType *FType = MF.getFunction().getFunctionType();
3421 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3422 bool IsError = false;
3423
3424 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CC: CallConv)) {
3425 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
3426 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));
3427 IsError = true;
3428 }
3429
3430 SmallVector<ISD::InputArg, 16> Splits;
3431 SmallVector<CCValAssign, 16> ArgLocs;
3432 BitVector Skipped(Ins.size());
3433 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3434 *DAG.getContext());
3435
3436 bool IsGraphics = AMDGPU::isGraphics(CC: CallConv);
3437 bool IsKernel = AMDGPU::isKernel(CC: CallConv);
3438 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC: CallConv);
3439
3440 if (IsGraphics) {
3441 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
3442 assert(!UserSGPRInfo.hasDispatchPtr() &&
3443 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
3444 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
3445 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
3446 (void)UserSGPRInfo;
3447 if (!Subtarget->hasFlatScratchEnabled())
3448 assert(!UserSGPRInfo.hasFlatScratchInit());
3449 if ((CallConv != CallingConv::AMDGPU_CS &&
3450 CallConv != CallingConv::AMDGPU_Gfx &&
3451 CallConv != CallingConv::AMDGPU_Gfx_WholeWave) ||
3452 !Subtarget->hasArchitectedSGPRs())
3453 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
3454 !Info->hasWorkGroupIDZ());
3455 }
3456
3457 bool IsWholeWaveFunc = Info->isWholeWaveFunction();
3458
3459 if (CallConv == CallingConv::AMDGPU_PS) {
3460 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
3461
3462 // At least one interpolation mode must be enabled or else the GPU will
3463 // hang.
3464 //
3465 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
3466 // set PSInputAddr, the user wants to enable some bits after the compilation
3467 // based on run-time states. Since we can't know what the final PSInputEna
3468 // will look like, so we shouldn't do anything here and the user should take
3469 // responsibility for the correct programming.
3470 //
3471 // Otherwise, the following restrictions apply:
3472 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
3473 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
3474 // enabled too.
3475 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
3476 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(Index: 11))) {
3477 CCInfo.AllocateReg(Reg: AMDGPU::VGPR0);
3478 CCInfo.AllocateReg(Reg: AMDGPU::VGPR1);
3479 Info->markPSInputAllocated(Index: 0);
3480 Info->markPSInputEnabled(Index: 0);
3481 }
3482 if (Subtarget->isAmdPalOS()) {
3483 // For isAmdPalOS, the user does not enable some bits after compilation
3484 // based on run-time states; the register values being generated here are
3485 // the final ones set in hardware. Therefore we need to apply the
3486 // workaround to PSInputAddr and PSInputEnable together. (The case where
3487 // a bit is set in PSInputAddr but not PSInputEnable is where the
3488 // frontend set up an input arg for a particular interpolation mode, but
3489 // nothing uses that input arg. Really we should have an earlier pass
3490 // that removes such an arg.)
3491 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
3492 if ((PsInputBits & 0x7F) == 0 ||
3493 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
3494 Info->markPSInputEnabled(Index: llvm::countr_zero(Val: Info->getPSInputAddr()));
3495 }
3496 } else if (IsKernel) {
3497 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
3498 } else {
3499 Splits.append(in_start: IsWholeWaveFunc ? std::next(x: Ins.begin()) : Ins.begin(),
3500 in_end: Ins.end());
3501 }
3502
3503 if (IsKernel)
3504 analyzeFormalArgumentsCompute(State&: CCInfo, Ins);
3505
3506 if (IsEntryFunc) {
3507 allocateSpecialEntryInputVGPRs(CCInfo, MF, TRI: *TRI, Info&: *Info);
3508 allocateHSAUserSGPRs(CCInfo, MF, TRI: *TRI, Info&: *Info);
3509 if (IsKernel && Subtarget->hasKernargPreload())
3510 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, TRI: *TRI, Info&: *Info);
3511
3512 allocateLDSKernelId(CCInfo, MF, TRI: *TRI, Info&: *Info);
3513 } else if (!IsGraphics) {
3514 // For the fixed ABI, pass workitem IDs in the last argument register.
3515 allocateSpecialInputVGPRsFixed(CCInfo, MF, TRI: *TRI, Info&: *Info);
3516
3517 // FIXME: Sink this into allocateSpecialInputSGPRs
3518 if (!Subtarget->hasFlatScratchEnabled())
3519 CCInfo.AllocateReg(Reg: Info->getScratchRSrcReg());
3520
3521 allocateSpecialInputSGPRs(CCInfo, MF, TRI: *TRI, Info&: *Info);
3522 }
3523
3524 if (!IsKernel) {
3525 CCAssignFn *AssignFn = CCAssignFnForCall(CC: CallConv, IsVarArg: isVarArg);
3526 CCInfo.AnalyzeFormalArguments(Ins: Splits, Fn: AssignFn);
3527
3528 // This assumes the registers are allocated by CCInfo in ascending order
3529 // with no gaps.
3530 Info->setNumWaveDispatchSGPRs(
3531 CCInfo.getFirstUnallocated(Regs: AMDGPU::SGPR_32RegClass.getRegisters()));
3532 Info->setNumWaveDispatchVGPRs(
3533 CCInfo.getFirstUnallocated(Regs: AMDGPU::VGPR_32RegClass.getRegisters()));
3534 } else if (Info->getNumKernargPreloadedSGPRs()) {
3535 Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
3536 }
3537
3538 SmallVector<SDValue, 16> Chains;
3539
3540 if (IsWholeWaveFunc) {
3541 SDValue Setup = DAG.getNode(Opcode: AMDGPUISD::WHOLE_WAVE_SETUP, DL,
3542 ResultTys: {MVT::i1, MVT::Other}, Ops: Chain);
3543 InVals.push_back(Elt: Setup.getValue(R: 0));
3544 Chains.push_back(Elt: Setup.getValue(R: 1));
3545 }
3546
3547 // FIXME: This is the minimum kernel argument alignment. We should improve
3548 // this to the maximum alignment of the arguments.
3549 //
3550 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
3551 // kern arg offset.
3552 const Align KernelArgBaseAlign = Align(16);
3553
3554 for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
3555 ++i) {
3556 const ISD::InputArg &Arg = Ins[i];
3557 if ((Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) || IsError) {
3558 InVals.push_back(Elt: DAG.getPOISON(VT: Arg.VT));
3559 continue;
3560 }
3561
3562 CCValAssign &VA = ArgLocs[ArgIdx++];
3563 MVT VT = VA.getLocVT();
3564
3565 if (IsEntryFunc && VA.isMemLoc()) {
3566 VT = Ins[i].VT;
3567 EVT MemVT = VA.getLocVT();
3568
3569 const uint64_t Offset = VA.getLocMemOffset();
3570 Align Alignment = commonAlignment(A: KernelArgBaseAlign, Offset);
3571
3572 if (Arg.Flags.isByRef()) {
3573 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL: DL, Chain, Offset);
3574
3575 const GCNTargetMachine &TM =
3576 static_cast<const GCNTargetMachine &>(getTargetMachine());
3577 if (!TM.isNoopAddrSpaceCast(SrcAS: AMDGPUAS::CONSTANT_ADDRESS,
3578 DestAS: Arg.Flags.getPointerAddrSpace())) {
3579 Ptr = DAG.getAddrSpaceCast(dl: DL, VT, Ptr, SrcAS: AMDGPUAS::CONSTANT_ADDRESS,
3580 DestAS: Arg.Flags.getPointerAddrSpace());
3581 }
3582
3583 InVals.push_back(Elt: Ptr);
3584 continue;
3585 }
3586
3587 SDValue NewArg;
3588 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(Val: i)) {
3589 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
3590 // In this case the argument is packed into the previous preload SGPR.
3591 int64_t AlignDownOffset = alignDown(Value: Offset, Align: 4);
3592 int64_t OffsetDiff = Offset - AlignDownOffset;
3593 EVT IntVT = MemVT.changeTypeToInteger();
3594
3595 const SIMachineFunctionInfo *Info =
3596 MF.getInfo<SIMachineFunctionInfo>();
3597 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
3598 Register Reg =
3599 Info->getArgInfo().PreloadKernArgs.find(Val: i)->getSecond().Regs[0];
3600
3601 assert(Reg);
3602 Register VReg = MRI.getLiveInVirtReg(PReg: Reg);
3603 SDValue Copy = DAG.getCopyFromReg(Chain, dl: DL, Reg: VReg, VT: MVT::i32);
3604
3605 SDValue ShiftAmt = DAG.getConstant(Val: OffsetDiff * 8, DL, VT: MVT::i32);
3606 SDValue Extract = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: Copy, N2: ShiftAmt);
3607
3608 SDValue ArgVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: IntVT, Operand: Extract);
3609 ArgVal = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MemVT, Operand: ArgVal);
3610 NewArg = convertArgType(DAG, VT, MemVT, SL: DL, Val: ArgVal,
3611 Signed: Ins[i].Flags.isSExt(), Arg: &Ins[i]);
3612
3613 NewArg = DAG.getMergeValues(Ops: {NewArg, Copy.getValue(R: 1)}, dl: DL);
3614 } else {
3615 const SIMachineFunctionInfo *Info =
3616 MF.getInfo<SIMachineFunctionInfo>();
3617 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
3618 const SmallVectorImpl<MCRegister> &PreloadRegs =
3619 Info->getArgInfo().PreloadKernArgs.find(Val: i)->getSecond().Regs;
3620
3621 SDValue Copy;
3622 if (PreloadRegs.size() == 1) {
3623 Register VReg = MRI.getLiveInVirtReg(PReg: PreloadRegs[0]);
3624 const TargetRegisterClass *RC = MRI.getRegClass(Reg: VReg);
3625 NewArg = DAG.getCopyFromReg(
3626 Chain, dl: DL, Reg: VReg,
3627 VT: EVT::getIntegerVT(Context&: *DAG.getContext(),
3628 BitWidth: TRI->getRegSizeInBits(RC: *RC)));
3629
3630 } else {
3631 // If the kernarg alignment does not match the alignment of the SGPR
3632 // tuple RC that can accommodate this argument, it will be built up
3633 // via copies from from the individual SGPRs that the argument was
3634 // preloaded to.
3635 SmallVector<SDValue, 4> Elts;
3636 for (auto Reg : PreloadRegs) {
3637 Register VReg = MRI.getLiveInVirtReg(PReg: Reg);
3638 Copy = DAG.getCopyFromReg(Chain, dl: DL, Reg: VReg, VT: MVT::i32);
3639 Elts.push_back(Elt: Copy);
3640 }
3641 NewArg =
3642 DAG.getBuildVector(VT: EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32,
3643 NumElements: PreloadRegs.size()),
3644 DL, Ops: Elts);
3645 }
3646
3647 // If the argument was preloaded to multiple consecutive 32-bit
3648 // registers because of misalignment between addressable SGPR tuples
3649 // and the argument size, we can still assume that because of kernarg
3650 // segment alignment restrictions that NewArg's size is the same as
3651 // MemVT and just do a bitcast. If MemVT is less than 32-bits we add a
3652 // truncate since we cannot preload to less than a single SGPR and the
3653 // MemVT may be smaller.
3654 EVT MemVTInt =
3655 EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: MemVT.getSizeInBits());
3656 if (MemVT.bitsLT(VT: NewArg.getSimpleValueType()))
3657 NewArg = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MemVTInt, Operand: NewArg);
3658
3659 NewArg = DAG.getBitcast(VT: MemVT, V: NewArg);
3660 NewArg = convertArgType(DAG, VT, MemVT, SL: DL, Val: NewArg,
3661 Signed: Ins[i].Flags.isSExt(), Arg: &Ins[i]);
3662 NewArg = DAG.getMergeValues(Ops: {NewArg, Chain}, dl: DL);
3663 }
3664 } else {
3665 // Hidden arguments that are in the kernel signature must be preloaded
3666 // to user SGPRs. Print a diagnostic error if a hidden argument is in
3667 // the argument list and is not preloaded.
3668 if (Arg.isOrigArg()) {
3669 Argument *OrigArg = Fn.getArg(i: Arg.getOrigArgIndex());
3670 if (OrigArg->hasAttribute(Kind: "amdgpu-hidden-argument")) {
3671 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
3672 *OrigArg->getParent(),
3673 "hidden argument in kernel signature was not preloaded",
3674 DL.getDebugLoc()));
3675 }
3676 }
3677
3678 NewArg =
3679 lowerKernargMemParameter(DAG, VT, MemVT, SL: DL, Chain, Offset,
3680 Alignment, Signed: Ins[i].Flags.isSExt(), Arg: &Ins[i]);
3681 }
3682 Chains.push_back(Elt: NewArg.getValue(R: 1));
3683
3684 auto *ParamTy =
3685 dyn_cast<PointerType>(Val: FType->getParamType(i: Ins[i].getOrigArgIndex()));
3686 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
3687 ParamTy &&
3688 (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3689 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3690 // On SI local pointers are just offsets into LDS, so they are always
3691 // less than 16-bits. On CI and newer they could potentially be
3692 // real pointers, so we can't guarantee their size.
3693 NewArg = DAG.getNode(Opcode: ISD::AssertZext, DL, VT: NewArg.getValueType(), N1: NewArg,
3694 N2: DAG.getValueType(MVT::i16));
3695 }
3696
3697 InVals.push_back(Elt: NewArg);
3698 continue;
3699 }
3700 if (!IsEntryFunc && VA.isMemLoc()) {
3701 SDValue Val = lowerStackParameter(DAG, VA, SL: DL, Chain, Arg);
3702 InVals.push_back(Elt: Val);
3703 if (!Arg.Flags.isByVal())
3704 Chains.push_back(Elt: Val.getValue(R: 1));
3705 continue;
3706 }
3707
3708 assert(VA.isRegLoc() && "Parameter must be in a register!");
3709
3710 Register Reg = VA.getLocReg();
3711 const TargetRegisterClass *RC = nullptr;
3712 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3713 RC = &AMDGPU::VGPR_32RegClass;
3714 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3715 RC = &AMDGPU::SGPR_32RegClass;
3716 else
3717 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3718
3719 Reg = MF.addLiveIn(PReg: Reg, RC);
3720 SDValue Val = DAG.getCopyFromReg(Chain, dl: DL, Reg, VT);
3721 if (Arg.Flags.isInReg() && RC == &AMDGPU::VGPR_32RegClass) {
3722 // FIXME: Need to forward the chains created by `CopyFromReg`s, make sure
3723 // they will read physical regs before any side effect instructions.
3724 SDValue ReadFirstLane =
3725 DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL, VT: MVT::i32);
3726 Val = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: Val.getValueType(),
3727 N1: ReadFirstLane, N2: Val);
3728 }
3729
3730 if (Arg.Flags.isSRet()) {
3731 // The return object should be reasonably addressable.
3732
3733 // FIXME: This helps when the return is a real sret. If it is a
3734 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3735 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3736 unsigned NumBits =
3737 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
3738 Val = DAG.getNode(
3739 Opcode: ISD::AssertZext, DL, VT, N1: Val,
3740 N2: DAG.getValueType(EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: NumBits)));
3741 }
3742
3743 Val = convertABITypeToValueType(DAG, Val, VA, SL: DL);
3744 InVals.push_back(Elt: Val);
3745 }
3746
3747 // Start adding system SGPRs.
3748 if (IsEntryFunc)
3749 allocateSystemSGPRs(CCInfo, MF, Info&: *Info, CallConv, IsShader: IsGraphics);
3750
3751 unsigned StackArgSize = CCInfo.getStackSize();
3752 Info->setBytesInStackArgArea(StackArgSize);
3753
3754 return Chains.empty() ? Chain
3755 : DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other, Ops: Chains);
3756}
3757
3758// TODO: If return values can't fit in registers, we should return as many as
3759// possible in registers before passing on stack.
3760bool SITargetLowering::CanLowerReturn(
3761 CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3762 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
3763 const Type *RetTy) const {
3764 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3765 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3766 // for shaders. Vector types should be explicitly handled by CC.
3767 if (AMDGPU::isEntryFunctionCC(CC: CallConv))
3768 return true;
3769
3770 SmallVector<CCValAssign, 16> RVLocs;
3771 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3772 if (!CCInfo.CheckReturn(Outs, Fn: CCAssignFnForReturn(CC: CallConv, IsVarArg)))
3773 return false;
3774
3775 // We must use the stack if return would require unavailable registers.
3776 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3777 unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
3778 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3779 if (CCInfo.isAllocated(Reg: AMDGPU::VGPR_32RegClass.getRegister(i)))
3780 return false;
3781
3782 return true;
3783}
3784
3785SDValue
3786SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3787 bool isVarArg,
3788 const SmallVectorImpl<ISD::OutputArg> &Outs,
3789 const SmallVectorImpl<SDValue> &OutVals,
3790 const SDLoc &DL, SelectionDAG &DAG) const {
3791 MachineFunction &MF = DAG.getMachineFunction();
3792 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3793 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
3794
3795 if (AMDGPU::isKernel(CC: CallConv)) {
3796 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3797 OutVals, DL, DAG);
3798 }
3799
3800 bool IsShader = AMDGPU::isShader(CC: CallConv);
3801
3802 Info->setIfReturnsVoid(Outs.empty());
3803 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3804
3805 // CCValAssign - represent the assignment of the return value to a location.
3806 SmallVector<CCValAssign, 48> RVLocs;
3807
3808 // CCState - Info about the registers and stack slots.
3809 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3810 *DAG.getContext());
3811
3812 // Analyze outgoing return values.
3813 CCInfo.AnalyzeReturn(Outs, Fn: CCAssignFnForReturn(CC: CallConv, IsVarArg: isVarArg));
3814
3815 SDValue Glue;
3816 SmallVector<SDValue, 48> RetOps;
3817 RetOps.push_back(Elt: Chain); // Operand #0 = Chain (updated below)
3818
3819 SDValue ReadFirstLane =
3820 DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL, VT: MVT::i32);
3821 // Copy the result values into the output registers.
3822 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3823 ++I, ++RealRVLocIdx) {
3824 CCValAssign &VA = RVLocs[I];
3825 assert(VA.isRegLoc() && "Can only return in registers!");
3826 // TODO: Partially return in registers if return values don't fit.
3827 SDValue Arg = OutVals[RealRVLocIdx];
3828
3829 // Copied from other backends.
3830 switch (VA.getLocInfo()) {
3831 case CCValAssign::Full:
3832 break;
3833 case CCValAssign::BCvt:
3834 Arg = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getLocVT(), Operand: Arg);
3835 break;
3836 case CCValAssign::SExt:
3837 Arg = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
3838 break;
3839 case CCValAssign::ZExt:
3840 Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
3841 break;
3842 case CCValAssign::AExt:
3843 Arg = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
3844 break;
3845 default:
3846 llvm_unreachable("Unknown loc info!");
3847 }
3848 if (TRI->isSGPRPhysReg(Reg: VA.getLocReg()))
3849 Arg = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: Arg.getValueType(),
3850 N1: ReadFirstLane, N2: Arg);
3851 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: VA.getLocReg(), N: Arg, Glue);
3852 Glue = Chain.getValue(R: 1);
3853 RetOps.push_back(Elt: DAG.getRegister(Reg: VA.getLocReg(), VT: VA.getLocVT()));
3854 }
3855
3856 // FIXME: Does sret work properly?
3857 if (!Info->isEntryFunction()) {
3858 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3859 const MCPhysReg *I =
3860 TRI->getCalleeSavedRegsViaCopy(MF: &DAG.getMachineFunction());
3861 if (I) {
3862 for (; *I; ++I) {
3863 if (AMDGPU::SReg_64RegClass.contains(Reg: *I))
3864 RetOps.push_back(Elt: DAG.getRegister(Reg: *I, VT: MVT::i64));
3865 else if (AMDGPU::SReg_32RegClass.contains(Reg: *I))
3866 RetOps.push_back(Elt: DAG.getRegister(Reg: *I, VT: MVT::i32));
3867 else
3868 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3869 }
3870 }
3871 }
3872
3873 // Update chain and glue.
3874 RetOps[0] = Chain;
3875 if (Glue.getNode())
3876 RetOps.push_back(Elt: Glue);
3877
3878 unsigned Opc = AMDGPUISD::ENDPGM;
3879 if (!IsWaveEnd)
3880 Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
3881 : IsShader ? AMDGPUISD::RETURN_TO_EPILOG
3882 : AMDGPUISD::RET_GLUE;
3883 return DAG.getNode(Opcode: Opc, DL, VT: MVT::Other, Ops: RetOps);
3884}
3885
3886SDValue SITargetLowering::LowerCallResult(
3887 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3888 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3889 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3890 SDValue ThisVal) const {
3891 CCAssignFn *RetCC = CCAssignFnForReturn(CC: CallConv, IsVarArg);
3892
3893 // Assign locations to each value returned by this call.
3894 SmallVector<CCValAssign, 16> RVLocs;
3895 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3896 *DAG.getContext());
3897 CCInfo.AnalyzeCallResult(Ins, Fn: RetCC);
3898
3899 // Copy all of the result registers out of their specified physreg.
3900 for (CCValAssign VA : RVLocs) {
3901 SDValue Val;
3902
3903 if (VA.isRegLoc()) {
3904 Val =
3905 DAG.getCopyFromReg(Chain, dl: DL, Reg: VA.getLocReg(), VT: VA.getLocVT(), Glue: InGlue);
3906 Chain = Val.getValue(R: 1);
3907 InGlue = Val.getValue(R: 2);
3908 } else if (VA.isMemLoc()) {
3909 report_fatal_error(reason: "TODO: return values in memory");
3910 } else
3911 llvm_unreachable("unknown argument location type");
3912
3913 switch (VA.getLocInfo()) {
3914 case CCValAssign::Full:
3915 break;
3916 case CCValAssign::BCvt:
3917 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getValVT(), Operand: Val);
3918 break;
3919 case CCValAssign::ZExt:
3920 Val = DAG.getNode(Opcode: ISD::AssertZext, DL, VT: VA.getLocVT(), N1: Val,
3921 N2: DAG.getValueType(VA.getValVT()));
3922 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VA.getValVT(), Operand: Val);
3923 break;
3924 case CCValAssign::SExt:
3925 Val = DAG.getNode(Opcode: ISD::AssertSext, DL, VT: VA.getLocVT(), N1: Val,
3926 N2: DAG.getValueType(VA.getValVT()));
3927 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VA.getValVT(), Operand: Val);
3928 break;
3929 case CCValAssign::AExt:
3930 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VA.getValVT(), Operand: Val);
3931 break;
3932 default:
3933 llvm_unreachable("Unknown loc info!");
3934 }
3935
3936 InVals.push_back(Elt: Val);
3937 }
3938
3939 return Chain;
3940}
3941
3942// Add code to pass special inputs required depending on used features separate
3943// from the explicit user arguments present in the IR.
3944void SITargetLowering::passSpecialInputs(
3945 CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3946 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3947 SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3948 // If we don't have a call site, this was a call inserted by
3949 // legalization. These can never use special inputs.
3950 if (!CLI.CB)
3951 return;
3952
3953 SelectionDAG &DAG = CLI.DAG;
3954 const SDLoc &DL = CLI.DL;
3955 const Function &F = DAG.getMachineFunction().getFunction();
3956
3957 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3958 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3959
3960 const AMDGPUFunctionArgInfo &CalleeArgInfo =
3961 AMDGPUFunctionArgInfo::FixedABIFunctionInfo;
3962
3963 // TODO: Unify with private memory register handling. This is complicated by
3964 // the fact that at least in kernels, the input argument is not necessarily
3965 // in the same location as the input.
3966 // clang-format off
3967 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3968 std::array<StringLiteral, 2>> ImplicitAttrs[] = {
3969 {AMDGPUFunctionArgInfo::DISPATCH_PTR, {"amdgpu-no-dispatch-ptr", ""}},
3970 {AMDGPUFunctionArgInfo::QUEUE_PTR, {"amdgpu-no-queue-ptr", ""}},
3971 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, {"amdgpu-no-implicitarg-ptr", ""}},
3972 {AMDGPUFunctionArgInfo::DISPATCH_ID, {"amdgpu-no-dispatch-id", ""}},
3973 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, {"amdgpu-no-workgroup-id-x", "amdgpu-no-cluster-id-x"}},
3974 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, {"amdgpu-no-workgroup-id-y", "amdgpu-no-cluster-id-y"}},
3975 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, {"amdgpu-no-workgroup-id-z", "amdgpu-no-cluster-id-z"}},
3976 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID, {"amdgpu-no-lds-kernel-id", ""}},
3977 };
3978 // clang-format on
3979
3980 for (auto [InputID, Attrs] : ImplicitAttrs) {
3981 // If the callee does not use the attribute value, skip copying the value.
3982 if (all_of(Range&: Attrs, P: [&](StringRef Attr) {
3983 return Attr.empty() || CLI.CB->hasFnAttr(Kind: Attr);
3984 }))
3985 continue;
3986
3987 const auto [OutgoingArg, ArgRC, ArgTy] =
3988 CalleeArgInfo.getPreloadedValue(Value: InputID);
3989 if (!OutgoingArg)
3990 continue;
3991
3992 const auto [IncomingArg, IncomingArgRC, Ty] =
3993 CallerArgInfo.getPreloadedValue(Value: InputID);
3994 assert(IncomingArgRC == ArgRC);
3995
3996 // All special arguments are ints for now.
3997 EVT ArgVT = TRI->getSpillSize(RC: *ArgRC) == 8 ? MVT::i64 : MVT::i32;
3998 SDValue InputReg;
3999
4000 if (IncomingArg) {
4001 InputReg = loadInputValue(DAG, RC: ArgRC, VT: ArgVT, SL: DL, Arg: *IncomingArg);
4002 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
4003 // The implicit arg ptr is special because it doesn't have a corresponding
4004 // input for kernels, and is computed from the kernarg segment pointer.
4005 InputReg = getImplicitArgPtr(DAG, SL: DL);
4006 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
4007 std::optional<uint32_t> Id =
4008 AMDGPUMachineFunctionInfo::getLDSKernelIdMetadata(F);
4009 if (Id.has_value()) {
4010 InputReg = DAG.getConstant(Val: *Id, DL, VT: ArgVT);
4011 } else {
4012 InputReg = DAG.getPOISON(VT: ArgVT);
4013 }
4014 } else {
4015 // We may have proven the input wasn't needed, although the ABI is
4016 // requiring it. We just need to allocate the register appropriately.
4017 InputReg = DAG.getPOISON(VT: ArgVT);
4018 }
4019
4020 if (OutgoingArg->isRegister()) {
4021 RegsToPass.emplace_back(Args: OutgoingArg->getRegister(), Args&: InputReg);
4022 if (!CCInfo.AllocateReg(Reg: OutgoingArg->getRegister()))
4023 report_fatal_error(reason: "failed to allocate implicit input argument");
4024 } else {
4025 unsigned SpecialArgOffset =
4026 CCInfo.AllocateStack(Size: ArgVT.getStoreSize(), Alignment: Align(4));
4027 SDValue ArgStore =
4028 storeStackInputValue(DAG, SL: DL, Chain, ArgVal: InputReg, Offset: SpecialArgOffset);
4029 MemOpChains.push_back(Elt: ArgStore);
4030 }
4031 }
4032
4033 // Pack workitem IDs into a single register or pass it as is if already
4034 // packed.
4035
4036 auto [OutgoingArg, ArgRC, Ty] =
4037 CalleeArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4038 if (!OutgoingArg)
4039 std::tie(args&: OutgoingArg, args&: ArgRC, args&: Ty) =
4040 CalleeArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4041 if (!OutgoingArg)
4042 std::tie(args&: OutgoingArg, args&: ArgRC, args&: Ty) =
4043 CalleeArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4044 if (!OutgoingArg)
4045 return;
4046
4047 const ArgDescriptor *IncomingArgX = std::get<0>(
4048 t: CallerArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_X));
4049 const ArgDescriptor *IncomingArgY = std::get<0>(
4050 t: CallerArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Y));
4051 const ArgDescriptor *IncomingArgZ = std::get<0>(
4052 t: CallerArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Z));
4053
4054 SDValue InputReg;
4055 SDLoc SL;
4056
4057 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr(Kind: "amdgpu-no-workitem-id-x");
4058 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr(Kind: "amdgpu-no-workitem-id-y");
4059 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr(Kind: "amdgpu-no-workitem-id-z");
4060
4061 // If incoming ids are not packed we need to pack them.
4062 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo.WorkItemIDX &&
4063 NeedWorkItemIDX) {
4064 if (Subtarget->getMaxWorkitemID(Kernel: F, Dimension: 0) != 0) {
4065 InputReg = loadInputValue(DAG, RC: ArgRC, VT: MVT::i32, SL: DL, Arg: *IncomingArgX);
4066 } else {
4067 InputReg = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
4068 }
4069 }
4070
4071 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo.WorkItemIDY &&
4072 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(Kernel: F, Dimension: 1) != 0) {
4073 SDValue Y = loadInputValue(DAG, RC: ArgRC, VT: MVT::i32, SL: DL, Arg: *IncomingArgY);
4074 Y = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: Y,
4075 N2: DAG.getShiftAmountConstant(Val: 10, VT: MVT::i32, DL: SL));
4076 InputReg = InputReg.getNode()
4077 ? DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: InputReg, N2: Y)
4078 : Y;
4079 }
4080
4081 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo.WorkItemIDZ &&
4082 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(Kernel: F, Dimension: 2) != 0) {
4083 SDValue Z = loadInputValue(DAG, RC: ArgRC, VT: MVT::i32, SL: DL, Arg: *IncomingArgZ);
4084 Z = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: Z,
4085 N2: DAG.getShiftAmountConstant(Val: 20, VT: MVT::i32, DL: SL));
4086 InputReg = InputReg.getNode()
4087 ? DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: InputReg, N2: Z)
4088 : Z;
4089 }
4090
4091 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
4092 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
4093 // We're in a situation where the outgoing function requires the workitem
4094 // ID, but the calling function does not have it (e.g a graphics function
4095 // calling a C calling convention function). This is illegal, but we need
4096 // to produce something.
4097 InputReg = DAG.getPOISON(VT: MVT::i32);
4098 } else {
4099 // Workitem ids are already packed, any of present incoming arguments
4100 // will carry all required fields.
4101 ArgDescriptor IncomingArg =
4102 ArgDescriptor::createArg(Arg: IncomingArgX ? *IncomingArgX
4103 : IncomingArgY ? *IncomingArgY
4104 : *IncomingArgZ,
4105 Mask: ~0u);
4106 InputReg = loadInputValue(DAG, RC: ArgRC, VT: MVT::i32, SL: DL, Arg: IncomingArg);
4107 }
4108 }
4109
4110 if (OutgoingArg->isRegister()) {
4111 if (InputReg)
4112 RegsToPass.emplace_back(Args: OutgoingArg->getRegister(), Args&: InputReg);
4113
4114 CCInfo.AllocateReg(Reg: OutgoingArg->getRegister());
4115 } else {
4116 unsigned SpecialArgOffset = CCInfo.AllocateStack(Size: 4, Alignment: Align(4));
4117 if (InputReg) {
4118 SDValue ArgStore =
4119 storeStackInputValue(DAG, SL: DL, Chain, ArgVal: InputReg, Offset: SpecialArgOffset);
4120 MemOpChains.push_back(Elt: ArgStore);
4121 }
4122 }
4123}
4124
4125bool SITargetLowering::isEligibleForTailCallOptimization(
4126 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
4127 const SmallVectorImpl<ISD::OutputArg> &Outs,
4128 const SmallVectorImpl<SDValue> &OutVals,
4129 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
4130 if (AMDGPU::isChainCC(CC: CalleeCC))
4131 return true;
4132
4133 if (!AMDGPU::mayTailCallThisCC(CC: CalleeCC))
4134 return false;
4135
4136 // For a divergent call target, we need to do a waterfall loop over the
4137 // possible callees which precludes us from using a simple jump.
4138 if (Callee->isDivergent())
4139 return false;
4140
4141 MachineFunction &MF = DAG.getMachineFunction();
4142 const Function &CallerF = MF.getFunction();
4143 CallingConv::ID CallerCC = CallerF.getCallingConv();
4144 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
4145 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4146
4147 // Kernels aren't callable, and don't have a live in return address so it
4148 // doesn't make sense to do a tail call with entry functions.
4149 if (!CallerPreserved)
4150 return false;
4151
4152 bool CCMatch = CallerCC == CalleeCC;
4153
4154 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
4155 if (AMDGPU::canGuaranteeTCO(CC: CalleeCC) && CCMatch)
4156 return true;
4157 return false;
4158 }
4159
4160 // TODO: Can we handle var args?
4161 if (IsVarArg)
4162 return false;
4163
4164 for (const Argument &Arg : CallerF.args()) {
4165 if (Arg.hasByValAttr())
4166 return false;
4167 }
4168
4169 LLVMContext &Ctx = *DAG.getContext();
4170
4171 // Check that the call results are passed in the same way.
4172 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C&: Ctx, Ins,
4173 CalleeFn: CCAssignFnForCall(CC: CalleeCC, IsVarArg),
4174 CallerFn: CCAssignFnForCall(CC: CallerCC, IsVarArg)))
4175 return false;
4176
4177 // The callee has to preserve all registers the caller needs to preserve.
4178 if (!CCMatch) {
4179 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4180 if (!TRI->regmaskSubsetEqual(mask0: CallerPreserved, mask1: CalleePreserved))
4181 return false;
4182 }
4183
4184 // Nothing more to check if the callee is taking no arguments.
4185 if (Outs.empty())
4186 return true;
4187
4188 SmallVector<CCValAssign, 16> ArgLocs;
4189 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
4190
4191 // FIXME: We are not allocating special input registers, so we will be
4192 // deciding based on incorrect register assignments.
4193 CCInfo.AnalyzeCallOperands(Outs, Fn: CCAssignFnForCall(CC: CalleeCC, IsVarArg));
4194
4195 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
4196 // If the stack arguments for this call do not fit into our own save area then
4197 // the call cannot be made tail.
4198 // TODO: Is this really necessary?
4199 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
4200 return false;
4201
4202 for (const auto &[CCVA, ArgVal] : zip_equal(t&: ArgLocs, u: OutVals)) {
4203 // FIXME: What about inreg arguments that end up passed in memory?
4204 if (!CCVA.isRegLoc())
4205 continue;
4206
4207 // If we are passing an argument in an SGPR, and the value is divergent,
4208 // this call requires a waterfall loop.
4209 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(Reg: CCVA.getLocReg())) {
4210 LLVM_DEBUG(
4211 dbgs() << "Cannot tail call due to divergent outgoing argument in "
4212 << printReg(CCVA.getLocReg(), TRI) << '\n');
4213 return false;
4214 }
4215 }
4216
4217 const MachineRegisterInfo &MRI = MF.getRegInfo();
4218 return parametersInCSRMatch(MRI, CallerPreservedMask: CallerPreserved, ArgLocs, OutVals);
4219}
4220
4221bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
4222 if (!CI->isTailCall())
4223 return false;
4224
4225 const Function *ParentFn = CI->getFunction();
4226 if (AMDGPU::isEntryFunctionCC(CC: ParentFn->getCallingConv()))
4227 return false;
4228 return true;
4229}
4230
namespace {
// Chain calls have special arguments that we need to handle. These are
// tagging along at the end of the arguments list(s), after the SGPR and VGPR
// arguments (index 0 and 1 respectively).
//
// The enumerators are positions in the call's argument list and are used in
// integer arithmetic, so this stays an unscoped enum.
enum ChainCallArgIdx {
  Exec = 2,
  Flags = 3,
  NumVGPRs = 4,
  FallbackExec = 5,
  FallbackCallee = 6
};
} // anonymous namespace
4243
4244// The wave scratch offset register is used as the global base pointer.
4245SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
4246 SmallVectorImpl<SDValue> &InVals) const {
4247 CallingConv::ID CallConv = CLI.CallConv;
4248 bool IsChainCallConv = AMDGPU::isChainCC(CC: CallConv);
4249
4250 SelectionDAG &DAG = CLI.DAG;
4251
4252 const SDLoc &DL = CLI.DL;
4253 SDValue Chain = CLI.Chain;
4254 SDValue Callee = CLI.Callee;
4255
4256 llvm::SmallVector<SDValue, 6> ChainCallSpecialArgs;
4257 bool UsesDynamicVGPRs = false;
4258 if (IsChainCallConv) {
4259 // The last arguments should be the value that we need to put in EXEC,
4260 // followed by the flags and any other arguments with special meanings.
4261 // Pop them out of CLI.Outs and CLI.OutVals before we do any processing so
4262 // we don't treat them like the "real" arguments.
4263 auto RequestedExecIt =
4264 llvm::find_if(Range&: CLI.Outs, P: [](const ISD::OutputArg &Arg) {
4265 return Arg.OrigArgIndex == 2;
4266 });
4267 assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");
4268
4269 size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
4270 CLI.OutVals.erase(CS: CLI.OutVals.begin() + SpecialArgsBeginIdx,
4271 CE: CLI.OutVals.end());
4272 CLI.Outs.erase(CS: RequestedExecIt, CE: CLI.Outs.end());
4273
4274 assert(CLI.Outs.back().OrigArgIndex < 2 &&
4275 "Haven't popped all the special args");
4276
4277 TargetLowering::ArgListEntry RequestedExecArg =
4278 CLI.Args[ChainCallArgIdx::Exec];
4279 if (!RequestedExecArg.Ty->isIntegerTy(BitWidth: Subtarget->getWavefrontSize()))
4280 return lowerUnhandledCall(CLI, InVals, Reason: "Invalid value for EXEC");
4281
4282 // Convert constants into TargetConstants, so they become immediate operands
4283 // instead of being selected into S_MOV.
4284 auto PushNodeOrTargetConstant = [&](TargetLowering::ArgListEntry Arg) {
4285 if (const auto *ArgNode = dyn_cast<ConstantSDNode>(Val&: Arg.Node)) {
4286 ChainCallSpecialArgs.push_back(Elt: DAG.getTargetConstant(
4287 Val: ArgNode->getAPIntValue(), DL, VT: ArgNode->getValueType(ResNo: 0)));
4288 } else
4289 ChainCallSpecialArgs.push_back(Elt: Arg.Node);
4290 };
4291
4292 PushNodeOrTargetConstant(RequestedExecArg);
4293
4294 // Process any other special arguments depending on the value of the flags.
4295 TargetLowering::ArgListEntry Flags = CLI.Args[ChainCallArgIdx::Flags];
4296
4297 const APInt &FlagsValue = cast<ConstantSDNode>(Val&: Flags.Node)->getAPIntValue();
4298 if (FlagsValue.isZero()) {
4299 if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
4300 return lowerUnhandledCall(CLI, InVals,
4301 Reason: "no additional args allowed if flags == 0");
4302 } else if (FlagsValue.isOneBitSet(BitNo: 0)) {
4303 if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
4304 return lowerUnhandledCall(CLI, InVals, Reason: "expected 3 additional args");
4305 }
4306
4307 if (!Subtarget->isWave32()) {
4308 return lowerUnhandledCall(
4309 CLI, InVals, Reason: "dynamic VGPR mode is only supported for wave32");
4310 }
4311
4312 UsesDynamicVGPRs = true;
4313 std::for_each(first: CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
4314 last: CLI.Args.end(), f: PushNodeOrTargetConstant);
4315 }
4316 }
4317
4318 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
4319 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
4320 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
4321 bool &IsTailCall = CLI.IsTailCall;
4322 bool IsVarArg = CLI.IsVarArg;
4323 bool IsSibCall = false;
4324 MachineFunction &MF = DAG.getMachineFunction();
4325
4326 if (Callee.isUndef() || isNullConstant(V: Callee)) {
4327 if (!CLI.IsTailCall) {
4328 for (ISD::InputArg &Arg : CLI.Ins)
4329 InVals.push_back(Elt: DAG.getPOISON(VT: Arg.VT));
4330 }
4331
4332 return Chain;
4333 }
4334
4335 if (IsVarArg) {
4336 return lowerUnhandledCall(CLI, InVals,
4337 Reason: "unsupported call to variadic function ");
4338 }
4339
4340 if (!CLI.CB)
4341 return lowerUnhandledCall(CLI, InVals, Reason: "unsupported libcall legalization");
4342
4343 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
4344 return lowerUnhandledCall(CLI, InVals,
4345 Reason: "unsupported required tail call to function ");
4346 }
4347
4348 if (IsTailCall) {
4349 IsTailCall = isEligibleForTailCallOptimization(Callee, CalleeCC: CallConv, IsVarArg,
4350 Outs, OutVals, Ins, DAG);
4351 if (!IsTailCall &&
4352 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
4353 report_fatal_error(reason: "failed to perform tail call elimination on a call "
4354 "site marked musttail or on llvm.amdgcn.cs.chain");
4355 }
4356
4357 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4358
4359 // A sibling call is one where we're under the usual C ABI and not planning
4360 // to change that but can still do a tail call:
4361 if (!TailCallOpt && IsTailCall)
4362 IsSibCall = true;
4363
4364 if (IsTailCall)
4365 ++NumTailCalls;
4366 }
4367
4368 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4369 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
4370 SmallVector<SDValue, 8> MemOpChains;
4371
4372 // Analyze operands of the call, assigning locations to each operand.
4373 SmallVector<CCValAssign, 16> ArgLocs;
4374 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
4375 CCAssignFn *AssignFn = CCAssignFnForCall(CC: CallConv, IsVarArg);
4376
4377 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CC: CallConv) &&
4378 CallConv != CallingConv::AMDGPU_Gfx_WholeWave) {
4379 // With a fixed ABI, allocate fixed registers before user arguments.
4380 passSpecialInputs(CLI, CCInfo, Info: *Info, RegsToPass, MemOpChains, Chain);
4381 }
4382
4383 // Mark the scratch resource descriptor as allocated so the CC analysis
4384 // does not assign user arguments to these registers, matching the callee.
4385 if (!Subtarget->hasFlatScratchEnabled())
4386 CCInfo.AllocateReg(Reg: Info->getScratchRSrcReg());
4387
4388 CCInfo.AnalyzeCallOperands(Outs, Fn: AssignFn);
4389
4390 // Get a count of how many bytes are to be pushed on the stack.
4391 unsigned NumBytes = CCInfo.getStackSize();
4392
4393 if (IsSibCall) {
4394 // Since we're not changing the ABI to make this a tail call, the memory
4395 // operands are already available in the caller's incoming argument space.
4396 NumBytes = 0;
4397 }
4398
4399 // FPDiff is the byte offset of the call's argument area from the callee's.
4400 // Stores to callee stack arguments will be placed in FixedStackSlots offset
4401 // by this amount for a tail call. In a sibling call it must be 0 because the
4402 // caller will deallocate the entire stack and the callee still expects its
4403 // arguments to begin at SP+0. Completely unused for non-tail calls.
4404 int32_t FPDiff = 0;
4405 MachineFrameInfo &MFI = MF.getFrameInfo();
4406 auto *TRI = Subtarget->getRegisterInfo();
4407
4408 // Adjust the stack pointer for the new arguments...
4409 // These operations are automatically eliminated by the prolog/epilog pass
4410 if (!IsSibCall)
4411 Chain = DAG.getCALLSEQ_START(Chain, InSize: 0, OutSize: 0, DL);
4412
4413 if (!IsSibCall || IsChainCallConv) {
4414 if (!Subtarget->hasFlatScratchEnabled()) {
4415 SmallVector<SDValue, 4> CopyFromChains;
4416
4417 // In the HSA case, this should be an identity copy.
4418 SDValue ScratchRSrcReg =
4419 DAG.getCopyFromReg(Chain, dl: DL, Reg: Info->getScratchRSrcReg(), VT: MVT::v4i32);
4420 RegsToPass.emplace_back(Args: IsChainCallConv
4421 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
4422 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
4423 Args&: ScratchRSrcReg);
4424 CopyFromChains.push_back(Elt: ScratchRSrcReg.getValue(R: 1));
4425 Chain = DAG.getTokenFactor(DL, Vals&: CopyFromChains);
4426 }
4427 }
4428
4429 const unsigned NumSpecialInputs = RegsToPass.size();
4430
4431 MVT PtrVT = MVT::i32;
4432
4433 // Walk the register/memloc assignments, inserting copies/loads.
4434 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4435 CCValAssign &VA = ArgLocs[i];
4436 SDValue Arg = OutVals[i];
4437
4438 // Promote the value if needed.
4439 switch (VA.getLocInfo()) {
4440 case CCValAssign::Full:
4441 break;
4442 case CCValAssign::BCvt:
4443 Arg = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getLocVT(), Operand: Arg);
4444 break;
4445 case CCValAssign::ZExt:
4446 Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
4447 break;
4448 case CCValAssign::SExt:
4449 Arg = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
4450 break;
4451 case CCValAssign::AExt:
4452 Arg = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
4453 break;
4454 case CCValAssign::FPExt:
4455 Arg = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
4456 break;
4457 default:
4458 llvm_unreachable("Unknown loc info!");
4459 }
4460
4461 if (VA.isRegLoc()) {
4462 RegsToPass.push_back(Elt: std::pair(VA.getLocReg(), Arg));
4463 } else {
4464 assert(VA.isMemLoc());
4465
4466 SDValue DstAddr;
4467 MachinePointerInfo DstInfo;
4468
4469 unsigned LocMemOffset = VA.getLocMemOffset();
4470 int32_t Offset = LocMemOffset;
4471
4472 SDValue PtrOff = DAG.getConstant(Val: Offset, DL, VT: PtrVT);
4473 MaybeAlign Alignment;
4474
4475 if (IsTailCall) {
4476 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4477 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
4478 : VA.getValVT().getStoreSize();
4479
4480 // FIXME: We can have better than the minimum byval required alignment.
4481 Alignment =
4482 Flags.isByVal()
4483 ? Flags.getNonZeroByValAlign()
4484 : commonAlignment(A: Subtarget->getStackAlignment(), Offset);
4485
4486 Offset = Offset + FPDiff;
4487 int FI = MFI.CreateFixedObject(Size: OpSize, SPOffset: Offset, IsImmutable: true);
4488
4489 DstAddr = DAG.getFrameIndex(FI, VT: PtrVT);
4490 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
4491
4492 // Make sure any stack arguments overlapping with where we're storing
4493 // are loaded before this eventual operation. Otherwise they'll be
4494 // clobbered.
4495
4496 // FIXME: Why is this really necessary? This seems to just result in a
4497 // lot of code to copy the stack and write them back to the same
4498 // locations, which are supposed to be immutable?
4499 Chain = addTokenForArgument(Chain, DAG, MFI, ClobberedFI: FI);
4500 } else {
4501 // Stores to the argument stack area are relative to the stack pointer.
4502 SDValue SP = DAG.getCopyFromReg(Chain, dl: DL, Reg: Info->getStackPtrOffsetReg(),
4503 VT: MVT::i32);
4504 DstAddr = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, N1: SP, N2: PtrOff);
4505 DstInfo = MachinePointerInfo::getStack(MF, Offset: LocMemOffset);
4506 Alignment =
4507 commonAlignment(A: Subtarget->getStackAlignment(), Offset: LocMemOffset);
4508 }
4509
4510 if (Outs[i].Flags.isByVal()) {
4511 SDValue SizeNode =
4512 DAG.getConstant(Val: Outs[i].Flags.getByValSize(), DL, VT: MVT::i32);
4513 SDValue Cpy =
4514 DAG.getMemcpy(Chain, dl: DL, Dst: DstAddr, Src: Arg, Size: SizeNode,
4515 DstAlign: Outs[i].Flags.getNonZeroByValAlign(),
4516 SrcAlign: Outs[i].Flags.getNonZeroByValAlign(),
4517 /*isVol = */ false, /*AlwaysInline = */ true,
4518 /*CI=*/nullptr, OverrideTailCall: std::nullopt, DstPtrInfo: DstInfo,
4519 SrcPtrInfo: MachinePointerInfo(AMDGPUAS::PRIVATE_ADDRESS));
4520
4521 MemOpChains.push_back(Elt: Cpy);
4522 } else {
4523 SDValue Store =
4524 DAG.getStore(Chain, dl: DL, Val: Arg, Ptr: DstAddr, PtrInfo: DstInfo, Alignment);
4525 MemOpChains.push_back(Elt: Store);
4526 }
4527 }
4528 }
4529
4530 if (!MemOpChains.empty())
4531 Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other, Ops: MemOpChains);
4532
4533 SDValue ReadFirstLaneID =
4534 DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL, VT: MVT::i32);
4535
4536 SDValue TokenGlue;
4537 if (CLI.ConvergenceControlToken) {
4538 TokenGlue = DAG.getNode(Opcode: ISD::CONVERGENCECTRL_GLUE, DL, VT: MVT::Glue,
4539 Operand: CLI.ConvergenceControlToken);
4540 }
4541
4542 // Build a sequence of copy-to-reg nodes chained together with token chain
4543 // and flag operands which copy the outgoing args into the appropriate regs.
4544 SDValue InGlue;
4545
4546 unsigned ArgIdx = 0;
4547 for (auto [Reg, Val] : RegsToPass) {
4548 if (ArgIdx++ >= NumSpecialInputs &&
4549 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
4550 // For chain calls, the inreg arguments are required to be
4551 // uniform. Speculatively Insert a readfirstlane in case we cannot prove
4552 // they are uniform.
4553 //
4554 // For other calls, if an inreg arguments is known to be uniform,
4555 // speculatively insert a readfirstlane in case it is in a VGPR.
4556 //
4557 // FIXME: We need to execute this in a waterfall loop if it is a divergent
4558 // value, so let that continue to produce invalid code.
4559
4560 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
4561 if (TokenGlue)
4562 ReadfirstlaneArgs.push_back(Elt: TokenGlue);
4563 Val = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: Val.getValueType(),
4564 Ops: ReadfirstlaneArgs);
4565 }
4566
4567 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg, N: Val, Glue: InGlue);
4568 InGlue = Chain.getValue(R: 1);
4569 }
4570
4571 // We don't usually want to end the call-sequence here because we would tidy
4572 // the frame up *after* the call, however in the ABI-changing tail-call case
4573 // we've carefully laid out the parameters so that when sp is reset they'll be
4574 // in the correct location.
4575 if (IsTailCall && !IsSibCall) {
4576 Chain = DAG.getCALLSEQ_END(Chain, Size1: NumBytes, Size2: 0, Glue: InGlue, DL);
4577 InGlue = Chain.getValue(R: 1);
4578 }
4579
4580 std::vector<SDValue> Ops({Chain});
4581
4582 // Add a redundant copy of the callee global which will not be legalized, as
4583 // we need direct access to the callee later.
4584 if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Val&: Callee)) {
4585 const GlobalValue *GV = GSD->getGlobal();
4586 Ops.push_back(x: Callee);
4587 Ops.push_back(x: DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i64));
4588 } else {
4589 if (IsTailCall) {
4590 // isEligibleForTailCallOptimization considered whether the call target is
4591 // divergent, but we may still end up with a uniform value in a VGPR.
4592 // Insert a readfirstlane just in case.
4593 SDValue ReadFirstLaneID =
4594 DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL, VT: MVT::i32);
4595
4596 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
4597 if (TokenGlue)
4598 ReadfirstlaneArgs.push_back(Elt: TokenGlue); // Wire up convergence token.
4599 Callee = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: Callee.getValueType(),
4600 Ops: ReadfirstlaneArgs);
4601 }
4602
4603 Ops.push_back(x: Callee);
4604 Ops.push_back(x: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i64));
4605 }
4606
4607 if (IsTailCall) {
4608 // Each tail call may have to adjust the stack by a different amount, so
4609 // this information must travel along with the operation for eventual
4610 // consumption by emitEpilogue.
4611 Ops.push_back(x: DAG.getTargetConstant(Val: FPDiff, DL, VT: MVT::i32));
4612 }
4613
4614 if (IsChainCallConv)
4615 llvm::append_range(C&: Ops, R&: ChainCallSpecialArgs);
4616
4617 // Add argument registers to the end of the list so that they are known live
4618 // into the call.
4619 for (auto &[Reg, Val] : RegsToPass)
4620 Ops.push_back(x: DAG.getRegister(Reg, VT: Val.getValueType()));
4621
4622 // Add a register mask operand representing the call-preserved registers.
4623 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
4624 assert(Mask && "Missing call preserved mask for calling convention");
4625 Ops.push_back(x: DAG.getRegisterMask(RegMask: Mask));
4626
4627 if (SDValue Token = CLI.ConvergenceControlToken) {
4628 SmallVector<SDValue, 2> GlueOps;
4629 GlueOps.push_back(Elt: Token);
4630 if (InGlue)
4631 GlueOps.push_back(Elt: InGlue);
4632
4633 InGlue = SDValue(DAG.getMachineNode(Opcode: TargetOpcode::CONVERGENCECTRL_GLUE, dl: DL,
4634 VT: MVT::Glue, Ops: GlueOps),
4635 0);
4636 }
4637
4638 if (InGlue)
4639 Ops.push_back(x: InGlue);
4640
4641 // If we're doing a tall call, use a TC_RETURN here rather than an
4642 // actual call instruction.
4643 if (IsTailCall) {
4644 MFI.setHasTailCall();
4645 unsigned OPC = AMDGPUISD::TC_RETURN;
4646 switch (CallConv) {
4647 case CallingConv::AMDGPU_Gfx:
4648 OPC = AMDGPUISD::TC_RETURN_GFX;
4649 break;
4650 case CallingConv::AMDGPU_CS_Chain:
4651 case CallingConv::AMDGPU_CS_ChainPreserve:
4652 OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
4653 : AMDGPUISD::TC_RETURN_CHAIN;
4654 break;
4655 }
4656
4657 // If the caller is a whole wave function, we need to use a special opcode
4658 // so we can patch up EXEC.
4659 if (Info->isWholeWaveFunction())
4660 OPC = AMDGPUISD::TC_RETURN_GFX_WholeWave;
4661
4662 return DAG.getNode(Opcode: OPC, DL, VT: MVT::Other, Ops);
4663 }
4664
4665 // Returns a chain and a flag for retval copy to use.
4666 SDValue Call = DAG.getNode(Opcode: AMDGPUISD::CALL, DL, ResultTys: {MVT::Other, MVT::Glue}, Ops);
4667 Chain = Call.getValue(R: 0);
4668 InGlue = Call.getValue(R: 1);
4669
4670 uint64_t CalleePopBytes = NumBytes;
4671 Chain = DAG.getCALLSEQ_END(Chain, Size1: 0, Size2: CalleePopBytes, Glue: InGlue, DL);
4672 if (!Ins.empty())
4673 InGlue = Chain.getValue(R: 1);
4674
4675 // Handle result values, copying them out of physregs into vregs that we
4676 // return.
4677 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4678 InVals, /*IsThisReturn=*/false, ThisVal: SDValue());
4679}
4680
4681// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4682// except for:
4683// 1. Stack growth direction(default: downwards, AMDGPU: upwards), and
4684// 2. Scale size where, scale = wave-reduction(alloca-size) * wave-size
4685SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
4686 SelectionDAG &DAG) const {
4687 const MachineFunction &MF = DAG.getMachineFunction();
4688 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4689
4690 SDLoc dl(Op);
4691 EVT VT = Op.getValueType();
4692 SDValue Chain = Op.getOperand(i: 0);
4693 Register SPReg = Info->getStackPtrOffsetReg();
4694
4695 // Chain the dynamic stack allocation so that it doesn't modify the stack
4696 // pointer when other instructions are using the stack.
4697 Chain = DAG.getCALLSEQ_START(Chain, InSize: 0, OutSize: 0, DL: dl);
4698
4699 SDValue Size = Op.getOperand(i: 1);
4700 SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, Reg: SPReg, VT);
4701 Align Alignment = cast<ConstantSDNode>(Val: Op.getOperand(i: 2))->getAlignValue();
4702
4703 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
4704 assert(TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp &&
4705 "Stack grows upwards for AMDGPU");
4706
4707 Chain = BaseAddr.getValue(R: 1);
4708 // When using flat-scratch, the stack offset is unscaled.
4709 const bool HasFlatScratch = Subtarget->hasFlatScratchEnabled();
4710 const unsigned WavefrontSizeLog2 = Subtarget->getWavefrontSizeLog2();
4711
4712 Align StackAlign = TFL->getStackAlign();
4713 if (Alignment > StackAlign) {
4714 uint64_t ScaledAlignment = Alignment.value()
4715 << (HasFlatScratch ? 0 : WavefrontSizeLog2);
4716 uint64_t StackAlignMask = ScaledAlignment - 1;
4717 SDValue TmpAddr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT, N1: BaseAddr,
4718 N2: DAG.getConstant(Val: StackAlignMask, DL: dl, VT));
4719 BaseAddr = DAG.getNode(Opcode: ISD::AND, DL: dl, VT, N1: TmpAddr,
4720 N2: DAG.getSignedConstant(Val: -ScaledAlignment, DL: dl, VT));
4721 }
4722
4723 assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4724 SDValue NewSP;
4725 if (isa<ConstantSDNode>(Val: Size)) {
4726 // Increase the stack pointer by the size of the alloca.
4727 // If not using flat-scratch, we have to scale the size by the wave-size.
4728 SDValue ScaledSize =
4729 HasFlatScratch
4730 ? Size
4731 : DAG.getNode(Opcode: ISD::SHL, DL: dl, VT, N1: Size,
4732 N2: DAG.getConstant(Val: WavefrontSizeLog2, DL: dl, VT: MVT::i32));
4733 NewSP = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT, N1: BaseAddr, N2: ScaledSize); // Value
4734 } else {
4735 // For dynamic sized alloca, perform wave-wide reduction to get max of
4736 // alloca size(divergent), and then scale it (when not using flat-scratch)
4737 // by wave-size.
4738 SDValue WaveReduction =
4739 DAG.getTargetConstant(Val: Intrinsic::amdgcn_wave_reduce_umax, DL: dl, VT: MVT::i32);
4740 Size = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT: MVT::i32, N1: WaveReduction,
4741 N2: Size, N3: DAG.getTargetConstant(Val: 0, DL: dl, VT: MVT::i32));
4742 SDValue ScaledSize = Size;
4743 if (!HasFlatScratch) {
4744 ScaledSize =
4745 DAG.getNode(Opcode: ISD::SHL, DL: dl, VT, N1: Size,
4746 N2: DAG.getConstant(Val: WavefrontSizeLog2, DL: dl, VT: MVT::i32));
4747 }
4748 NewSP =
4749 DAG.getNode(Opcode: ISD::ADD, DL: dl, VT, N1: BaseAddr, N2: ScaledSize); // Value in vgpr.
4750 SDValue ReadFirstLaneID =
4751 DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL: dl, VT: MVT::i32);
4752 NewSP = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT: MVT::i32, N1: ReadFirstLaneID,
4753 N2: NewSP);
4754 }
4755
4756 Chain = DAG.getCopyToReg(Chain, dl, Reg: SPReg, N: NewSP); // Output chain
4757 SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, Size1: 0, Size2: 0, Glue: SDValue(), DL: dl);
4758
4759 return DAG.getMergeValues(Ops: {BaseAddr, CallSeqEnd}, dl);
4760}
4761
4762SDValue SITargetLowering::LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const {
4763 if (Op.getValueType() != MVT::i32)
4764 return Op; // Defer to cannot select error.
4765
4766 Register SP = getStackPointerRegisterToSaveRestore();
4767 SDLoc SL(Op);
4768
4769 SDValue CopyFromSP = DAG.getCopyFromReg(Chain: Op->getOperand(Num: 0), dl: SL, Reg: SP, VT: MVT::i32);
4770
4771 // Convert from wave uniform to swizzled vector address. This should protect
4772 // from any edge cases where the stacksave result isn't directly used with
4773 // stackrestore.
4774 SDValue VectorAddress =
4775 DAG.getNode(Opcode: AMDGPUISD::WAVE_ADDRESS, DL: SL, VT: MVT::i32, Operand: CopyFromSP);
4776 return DAG.getMergeValues(Ops: {VectorAddress, CopyFromSP.getValue(R: 1)}, dl: SL);
4777}
4778
4779SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
4780 SelectionDAG &DAG) const {
4781 SDLoc SL(Op);
4782 assert(Op.getValueType() == MVT::i32);
4783
4784 uint32_t BothRoundHwReg =
4785 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 0, Values: 4);
4786 SDValue GetRoundBothImm = DAG.getTargetConstant(Val: BothRoundHwReg, DL: SL, VT: MVT::i32);
4787
4788 SDValue IntrinID =
4789 DAG.getTargetConstant(Val: Intrinsic::amdgcn_s_getreg, DL: SL, VT: MVT::i32);
4790 SDValue GetReg = DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL: SL, VTList: Op->getVTList(),
4791 N1: Op.getOperand(i: 0), N2: IntrinID, N3: GetRoundBothImm);
4792
4793 // There are two rounding modes, one for f32 and one for f64/f16. We only
4794 // report in the standard value range if both are the same.
4795 //
4796 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4797 // ties away from zero is not supported, and the other values are rotated by
4798 // 1.
4799 //
4800 // If the two rounding modes are not the same, report a target defined value.
4801
4802 // Mode register rounding mode fields:
4803 //
4804 // [1:0] Single-precision round mode.
4805 // [3:2] Double/Half-precision round mode.
4806 //
4807 // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero.
4808 //
4809 // Hardware Spec
4810 // Toward-0 3 0
4811 // Nearest Even 0 1
4812 // +Inf 1 2
4813 // -Inf 2 3
4814 // NearestAway0 N/A 4
4815 //
4816 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4817 // table we can index by the raw hardware mode.
4818 //
4819 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
4820
4821 SDValue BitTable =
4822 DAG.getConstant(Val: AMDGPU::FltRoundConversionTable, DL: SL, VT: MVT::i64);
4823
4824 SDValue Two = DAG.getConstant(Val: 2, DL: SL, VT: MVT::i32);
4825 SDValue RoundModeTimesNumBits =
4826 DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: GetReg, N2: Two);
4827
4828 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4829 // knew only one mode was demanded.
4830 SDValue TableValue =
4831 DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: MVT::i64, N1: BitTable, N2: RoundModeTimesNumBits);
4832 SDValue TruncTable = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: TableValue);
4833
4834 SDValue EntryMask = DAG.getConstant(Val: 0xf, DL: SL, VT: MVT::i32);
4835 SDValue TableEntry =
4836 DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32, N1: TruncTable, N2: EntryMask);
4837
4838 // There's a gap in the 4-bit encoded table and actual enum values, so offset
4839 // if it's an extended value.
4840 SDValue Four = DAG.getConstant(Val: 4, DL: SL, VT: MVT::i32);
4841 SDValue IsStandardValue =
4842 DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: TableEntry, RHS: Four, Cond: ISD::SETULT);
4843 SDValue EnumOffset = DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32, N1: TableEntry, N2: Four);
4844 SDValue Result = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::i32, N1: IsStandardValue,
4845 N2: TableEntry, N3: EnumOffset);
4846
4847 return DAG.getMergeValues(Ops: {Result, GetReg.getValue(R: 1)}, dl: SL);
4848}
4849
4850SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op,
4851 SelectionDAG &DAG) const {
4852 SDLoc SL(Op);
4853
4854 SDValue NewMode = Op.getOperand(i: 1);
4855 assert(NewMode.getValueType() == MVT::i32);
4856
4857 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4858 // hardware MODE.fp_round values.
4859 if (auto *ConstMode = dyn_cast<ConstantSDNode>(Val&: NewMode)) {
4860 uint32_t ClampedVal = std::min(
4861 a: static_cast<uint32_t>(ConstMode->getZExtValue()),
4862 b: static_cast<uint32_t>(AMDGPU::TowardZeroF32_TowardNegativeF64));
4863 NewMode = DAG.getConstant(
4864 Val: AMDGPU::decodeFltRoundToHWConversionTable(FltRounds: ClampedVal), DL: SL, VT: MVT::i32);
4865 } else {
4866 // If we know the input can only be one of the supported standard modes in
4867 // the range 0-3, we can use a simplified mapping to hardware values.
4868 KnownBits KB = DAG.computeKnownBits(Op: NewMode);
4869 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4870 // The supported standard values are 0-3. The extended values start at 8. We
4871 // need to offset by 4 if the value is in the extended range.
4872
4873 if (UseReducedTable) {
4874 // Truncate to the low 32-bits.
4875 SDValue BitTable = DAG.getConstant(
4876 Val: AMDGPU::FltRoundToHWConversionTable & 0xffff, DL: SL, VT: MVT::i32);
4877
4878 SDValue Two = DAG.getConstant(Val: 2, DL: SL, VT: MVT::i32);
4879 SDValue RoundModeTimesNumBits =
4880 DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: NewMode, N2: Two);
4881
4882 NewMode =
4883 DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: MVT::i32, N1: BitTable, N2: RoundModeTimesNumBits);
4884
4885 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4886 // the table extracted bits into inline immediates.
4887 } else {
4888 // table_index = umin(value, value - 4)
4889 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
4890 SDValue BitTable =
4891 DAG.getConstant(Val: AMDGPU::FltRoundToHWConversionTable, DL: SL, VT: MVT::i64);
4892
4893 SDValue Four = DAG.getConstant(Val: 4, DL: SL, VT: MVT::i32);
4894 SDValue OffsetEnum = DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i32, N1: NewMode, N2: Four);
4895 SDValue IndexVal =
4896 DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32, N1: NewMode, N2: OffsetEnum);
4897
4898 SDValue Two = DAG.getConstant(Val: 2, DL: SL, VT: MVT::i32);
4899 SDValue RoundModeTimesNumBits =
4900 DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: IndexVal, N2: Two);
4901
4902 SDValue TableValue =
4903 DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: MVT::i64, N1: BitTable, N2: RoundModeTimesNumBits);
4904 SDValue TruncTable = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: TableValue);
4905
4906 // No need to mask out the high bits since the setreg will ignore them
4907 // anyway.
4908 NewMode = TruncTable;
4909 }
4910
4911 // Insert a readfirstlane in case the value is a VGPR. We could do this
4912 // earlier and keep more operations scalar, but that interferes with
4913 // combining the source.
4914 SDValue ReadFirstLaneID =
4915 DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL: SL, VT: MVT::i32);
4916 NewMode = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32,
4917 N1: ReadFirstLaneID, N2: NewMode);
4918 }
4919
4920 // N.B. The setreg will be later folded into s_round_mode on supported
4921 // targets.
4922 SDValue IntrinID =
4923 DAG.getTargetConstant(Val: Intrinsic::amdgcn_s_setreg, DL: SL, VT: MVT::i32);
4924 uint32_t BothRoundHwReg =
4925 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 0, Values: 4);
4926 SDValue RoundBothImm = DAG.getTargetConstant(Val: BothRoundHwReg, DL: SL, VT: MVT::i32);
4927
4928 SDValue SetReg =
4929 DAG.getNode(Opcode: ISD::INTRINSIC_VOID, DL: SL, VTList: Op->getVTList(), N1: Op.getOperand(i: 0),
4930 N2: IntrinID, N3: RoundBothImm, N4: NewMode);
4931
4932 return SetReg;
4933}
4934
4935SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
4936 if (Op->isDivergent() &&
4937 (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(i: 4)))
4938 // Cannot do I$ prefetch with divergent pointer.
4939 return SDValue();
4940
4941 switch (cast<MemSDNode>(Val&: Op)->getAddressSpace()) {
4942 case AMDGPUAS::FLAT_ADDRESS:
4943 case AMDGPUAS::GLOBAL_ADDRESS:
4944 case AMDGPUAS::CONSTANT_ADDRESS:
4945 break;
4946 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
4947 if (Subtarget->hasSafeSmemPrefetch())
4948 break;
4949 [[fallthrough]];
4950 default:
4951 return SDValue();
4952 }
4953
4954 // I$ prefetch
4955 if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(i: 4))
4956 return SDValue();
4957
4958 return Op;
4959}
4960
4961// Work around DAG legality rules only based on the result type.
4962SDValue SITargetLowering::lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
4963 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4964 SDValue Src = Op.getOperand(i: IsStrict ? 1 : 0);
4965 EVT SrcVT = Src.getValueType();
4966
4967 if (SrcVT.getScalarType() != MVT::bf16)
4968 return Op;
4969
4970 SDLoc SL(Op);
4971 SDValue BitCast =
4972 DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: SrcVT.changeTypeToInteger(), Operand: Src);
4973
4974 EVT DstVT = Op.getValueType();
4975 if (IsStrict)
4976 llvm_unreachable("Need STRICT_BF16_TO_FP");
4977
4978 return DAG.getNode(Opcode: ISD::BF16_TO_FP, DL: SL, VT: DstVT, Operand: BitCast);
4979}
4980
4981SDValue SITargetLowering::lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const {
4982 SDLoc SL(Op);
4983 if (Op.getValueType() != MVT::i64)
4984 return Op;
4985
4986 uint32_t ModeHwReg =
4987 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 0, Values: 23);
4988 SDValue ModeHwRegImm = DAG.getTargetConstant(Val: ModeHwReg, DL: SL, VT: MVT::i32);
4989 uint32_t TrapHwReg =
4990 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_TRAPSTS, Values: 0, Values: 5);
4991 SDValue TrapHwRegImm = DAG.getTargetConstant(Val: TrapHwReg, DL: SL, VT: MVT::i32);
4992
4993 SDVTList VTList = DAG.getVTList(VT1: MVT::i32, VT2: MVT::Other);
4994 SDValue IntrinID =
4995 DAG.getTargetConstant(Val: Intrinsic::amdgcn_s_getreg, DL: SL, VT: MVT::i32);
4996 SDValue GetModeReg = DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL: SL, VTList,
4997 N1: Op.getOperand(i: 0), N2: IntrinID, N3: ModeHwRegImm);
4998 SDValue GetTrapReg = DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL: SL, VTList,
4999 N1: Op.getOperand(i: 0), N2: IntrinID, N3: TrapHwRegImm);
5000 SDValue TokenReg =
5001 DAG.getNode(Opcode: ISD::TokenFactor, DL: SL, VT: MVT::Other, N1: GetModeReg.getValue(R: 1),
5002 N2: GetTrapReg.getValue(R: 1));
5003
5004 SDValue CvtPtr =
5005 DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32, N1: GetModeReg, N2: GetTrapReg);
5006 SDValue Result = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: CvtPtr);
5007
5008 return DAG.getMergeValues(Ops: {Result, TokenReg}, dl: SL);
5009}
5010
5011SDValue SITargetLowering::lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const {
5012 SDLoc SL(Op);
5013 if (Op.getOperand(i: 1).getValueType() != MVT::i64)
5014 return Op;
5015
5016 SDValue Input = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Op.getOperand(i: 1));
5017 SDValue NewModeReg = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Input,
5018 N2: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32));
5019 SDValue NewTrapReg = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Input,
5020 N2: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32));
5021
5022 SDValue ReadFirstLaneID =
5023 DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL: SL, VT: MVT::i32);
5024 NewModeReg = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32,
5025 N1: ReadFirstLaneID, N2: NewModeReg);
5026 NewTrapReg = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32,
5027 N1: ReadFirstLaneID, N2: NewTrapReg);
5028
5029 unsigned ModeHwReg =
5030 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 0, Values: 23);
5031 SDValue ModeHwRegImm = DAG.getTargetConstant(Val: ModeHwReg, DL: SL, VT: MVT::i32);
5032 unsigned TrapHwReg =
5033 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_TRAPSTS, Values: 0, Values: 5);
5034 SDValue TrapHwRegImm = DAG.getTargetConstant(Val: TrapHwReg, DL: SL, VT: MVT::i32);
5035
5036 SDValue IntrinID =
5037 DAG.getTargetConstant(Val: Intrinsic::amdgcn_s_setreg, DL: SL, VT: MVT::i32);
5038 SDValue SetModeReg =
5039 DAG.getNode(Opcode: ISD::INTRINSIC_VOID, DL: SL, VT: MVT::Other, N1: Op.getOperand(i: 0),
5040 N2: IntrinID, N3: ModeHwRegImm, N4: NewModeReg);
5041 SDValue SetTrapReg =
5042 DAG.getNode(Opcode: ISD::INTRINSIC_VOID, DL: SL, VT: MVT::Other, N1: Op.getOperand(i: 0),
5043 N2: IntrinID, N3: TrapHwRegImm, N4: NewTrapReg);
5044 return DAG.getNode(Opcode: ISD::TokenFactor, DL: SL, VT: MVT::Other, N1: SetTrapReg, N2: SetModeReg);
5045}
5046
5047Register SITargetLowering::getRegisterByName(const char *RegName, LLT VT,
5048 const MachineFunction &MF) const {
5049 const Function &Fn = MF.getFunction();
5050
5051 Register Reg = StringSwitch<Register>(RegName)
5052 .Case(S: "m0", Value: AMDGPU::M0)
5053 .Case(S: "exec", Value: AMDGPU::EXEC)
5054 .Case(S: "exec_lo", Value: AMDGPU::EXEC_LO)
5055 .Case(S: "exec_hi", Value: AMDGPU::EXEC_HI)
5056 .Case(S: "flat_scratch", Value: AMDGPU::FLAT_SCR)
5057 .Case(S: "flat_scratch_lo", Value: AMDGPU::FLAT_SCR_LO)
5058 .Case(S: "flat_scratch_hi", Value: AMDGPU::FLAT_SCR_HI)
5059 .Default(Value: Register());
5060 if (!Reg)
5061 return Reg;
5062
5063 if (!Subtarget->hasFlatScrRegister() &&
5064 Subtarget->getRegisterInfo()->regsOverlap(RegA: Reg, RegB: AMDGPU::FLAT_SCR)) {
5065 Fn.getContext().emitError(ErrorStr: Twine("invalid register \"" + StringRef(RegName) +
5066 "\" for subtarget."));
5067 }
5068
5069 switch (Reg) {
5070 case AMDGPU::M0:
5071 case AMDGPU::EXEC_LO:
5072 case AMDGPU::EXEC_HI:
5073 case AMDGPU::FLAT_SCR_LO:
5074 case AMDGPU::FLAT_SCR_HI:
5075 if (VT.getSizeInBits() == 32)
5076 return Reg;
5077 break;
5078 case AMDGPU::EXEC:
5079 case AMDGPU::FLAT_SCR:
5080 if (VT.getSizeInBits() == 64)
5081 return Reg;
5082 break;
5083 default:
5084 llvm_unreachable("missing register type checking");
5085 }
5086
5087 report_fatal_error(
5088 reason: Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
5089}
5090
5091// If kill is not the last instruction, split the block so kill is always a
5092// proper terminator.
5093MachineBasicBlock *
5094SITargetLowering::splitKillBlock(MachineInstr &MI,
5095 MachineBasicBlock *BB) const {
5096 MachineBasicBlock *SplitBB = BB->splitAt(SplitInst&: MI, /*UpdateLiveIns=*/true);
5097 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5098 MI.setDesc(TII->getKillTerminatorFromPseudo(Opcode: MI.getOpcode()));
5099 return SplitBB;
5100}
5101
5102// Split block \p MBB at \p MI, as to insert a loop. If \p InstInLoop is true,
5103// \p MI will be the only instruction in the loop body block. Otherwise, it will
5104// be the first instruction in the remainder block.
5105//
5106/// \returns { LoopBody, Remainder }
5107static std::pair<MachineBasicBlock *, MachineBasicBlock *>
5108splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
5109 MachineFunction *MF = MBB.getParent();
5110 MachineBasicBlock::iterator I(&MI);
5111
5112 // To insert the loop we need to split the block. Move everything after this
5113 // point to a new block, and insert a new empty block between the two.
5114 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
5115 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
5116 MachineFunction::iterator MBBI(MBB);
5117 ++MBBI;
5118
5119 MF->insert(MBBI, MBB: LoopBB);
5120 MF->insert(MBBI, MBB: RemainderBB);
5121
5122 LoopBB->addSuccessor(Succ: LoopBB);
5123 LoopBB->addSuccessor(Succ: RemainderBB);
5124
5125 // Move the rest of the block into a new block.
5126 RemainderBB->transferSuccessorsAndUpdatePHIs(FromMBB: &MBB);
5127
5128 if (InstInLoop) {
5129 auto Next = std::next(x: I);
5130
5131 // Move instruction to loop body.
5132 LoopBB->splice(Where: LoopBB->begin(), Other: &MBB, From: I, To: Next);
5133
5134 // Move the rest of the block.
5135 RemainderBB->splice(Where: RemainderBB->begin(), Other: &MBB, From: Next, To: MBB.end());
5136 } else {
5137 RemainderBB->splice(Where: RemainderBB->begin(), Other: &MBB, From: I, To: MBB.end());
5138 }
5139
5140 MBB.addSuccessor(Succ: LoopBB);
5141
5142 return std::pair(LoopBB, RemainderBB);
5143}
5144
5145/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
5146void SITargetLowering::bundleInstWithWaitcnt(MachineInstr &MI) const {
5147 MachineBasicBlock *MBB = MI.getParent();
5148 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5149 auto I = MI.getIterator();
5150 auto E = std::next(x: I);
5151
5152 // clang-format off
5153 BuildMI(BB&: *MBB, I: E, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_WAITCNT))
5154 .addImm(Val: 0);
5155 // clang-format on
5156
5157 MIBundleBuilder Bundler(*MBB, I, E);
5158 finalizeBundle(MBB&: *MBB, FirstMI: Bundler.begin());
5159}
5160
5161MachineBasicBlock *
5162SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
5163 MachineBasicBlock *BB) const {
5164 const DebugLoc &DL = MI.getDebugLoc();
5165
5166 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5167
5168 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5169
5170 // Apparently kill flags are only valid if the def is in the same block?
5171 if (MachineOperand *Src = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::data0))
5172 Src->setIsKill(false);
5173
5174 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB&: *BB, InstInLoop: true);
5175
5176 MachineBasicBlock::iterator I = LoopBB->end();
5177
5178 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
5179 Values: AMDGPU::Hwreg::ID_TRAPSTS, Values: AMDGPU::Hwreg::OFFSET_MEM_VIOL, Values: 1);
5180
5181 // Clear TRAP_STS.MEM_VIOL
5182 BuildMI(BB&: *LoopBB, I: LoopBB->begin(), MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_SETREG_IMM32_B32))
5183 .addImm(Val: 0)
5184 .addImm(Val: EncodedReg);
5185
5186 bundleInstWithWaitcnt(MI);
5187
5188 Register Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
5189
5190 // Load and check TRAP_STS.MEM_VIOL
5191 BuildMI(BB&: *LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_GETREG_B32), DestReg: Reg)
5192 .addImm(Val: EncodedReg);
5193
5194 // FIXME: Do we need to use an isel pseudo that may clobber scc?
5195 BuildMI(BB&: *LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_LG_U32))
5196 .addReg(RegNo: Reg, Flags: RegState::Kill)
5197 .addImm(Val: 0);
5198 // clang-format off
5199 BuildMI(BB&: *LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CBRANCH_SCC1))
5200 .addMBB(MBB: LoopBB);
5201 // clang-format on
5202
5203 return RemainderBB;
5204}
5205
5206// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
5207// wavefront. If the value is uniform and just happens to be in a VGPR, this
5208// will only do one iteration. In the worst case, this will loop 64 times.
5209//
5210// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
5211static MachineBasicBlock::iterator
5212emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
5213 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
5214 const DebugLoc &DL, const MachineOperand &Idx,
5215 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
5216 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
5217 Register &SGPRIdxReg) {
5218
5219 MachineFunction *MF = OrigBB.getParent();
5220 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5221 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5222 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
5223 MachineBasicBlock::iterator I = LoopBB.begin();
5224
5225 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5226 Register PhiExec = MRI.createVirtualRegister(RegClass: BoolRC);
5227 Register NewExec = MRI.createVirtualRegister(RegClass: BoolRC);
5228 Register CurrentIdxReg =
5229 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
5230 Register CondReg = MRI.createVirtualRegister(RegClass: BoolRC);
5231
5232 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::PHI), DestReg: PhiReg)
5233 .addReg(RegNo: InitReg)
5234 .addMBB(MBB: &OrigBB)
5235 .addReg(RegNo: ResultReg)
5236 .addMBB(MBB: &LoopBB);
5237
5238 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::PHI), DestReg: PhiExec)
5239 .addReg(RegNo: InitSaveExecReg)
5240 .addMBB(MBB: &OrigBB)
5241 .addReg(RegNo: NewExec)
5242 .addMBB(MBB: &LoopBB);
5243
5244 // Read the next variant <- also loop target.
5245 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: CurrentIdxReg)
5246 .addReg(RegNo: Idx.getReg(), Flags: getUndefRegState(B: Idx.isUndef()));
5247
5248 // Compare the just read M0 value to all possible Idx values.
5249 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_CMP_EQ_U32_e64), DestReg: CondReg)
5250 .addReg(RegNo: CurrentIdxReg)
5251 .addReg(RegNo: Idx.getReg(), Flags: {}, SubReg: Idx.getSubReg());
5252
5253 // Update EXEC, save the original EXEC value to VCC.
5254 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: LMC.AndSaveExecOpc), DestReg: NewExec)
5255 .addReg(RegNo: CondReg, Flags: RegState::Kill);
5256
5257 MRI.setSimpleHint(VReg: NewExec, PrefReg: CondReg);
5258
5259 if (UseGPRIdxMode) {
5260 if (Offset == 0) {
5261 SGPRIdxReg = CurrentIdxReg;
5262 } else {
5263 SGPRIdxReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SGPR_32RegClass);
5264 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: SGPRIdxReg)
5265 .addReg(RegNo: CurrentIdxReg, Flags: RegState::Kill)
5266 .addImm(Val: Offset);
5267 }
5268 } else {
5269 // Move index from VCC into M0
5270 if (Offset == 0) {
5271 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
5272 .addReg(RegNo: CurrentIdxReg, Flags: RegState::Kill);
5273 } else {
5274 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: AMDGPU::M0)
5275 .addReg(RegNo: CurrentIdxReg, Flags: RegState::Kill)
5276 .addImm(Val: Offset);
5277 }
5278 }
5279
5280 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
5281 MachineInstr *InsertPt =
5282 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: LMC.XorTermOpc), DestReg: LMC.ExecReg)
5283 .addReg(RegNo: LMC.ExecReg)
5284 .addReg(RegNo: NewExec);
5285
5286 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
5287 // s_cbranch_scc0?
5288
5289 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
5290 // clang-format off
5291 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CBRANCH_EXECNZ))
5292 .addMBB(MBB: &LoopBB);
5293 // clang-format on
5294
5295 return InsertPt->getIterator();
5296}
5297
5298// This has slightly sub-optimal regalloc when the source vector is killed by
5299// the read. The register allocator does not understand that the kill is
5300// per-workitem, so is kept alive for the whole loop so we end up not re-using a
5301// subregister from it, using 1 more VGPR than necessary. This was saved when
5302// this was expanded after register allocation.
5303static MachineBasicBlock::iterator
5304loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI,
5305 unsigned InitResultReg, unsigned PhiReg, int Offset,
5306 bool UseGPRIdxMode, Register &SGPRIdxReg) {
5307 MachineFunction *MF = MBB.getParent();
5308 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5309 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5310 MachineRegisterInfo &MRI = MF->getRegInfo();
5311 const DebugLoc &DL = MI.getDebugLoc();
5312 MachineBasicBlock::iterator I(&MI);
5313
5314 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
5315 Register DstReg = MI.getOperand(i: 0).getReg();
5316 Register SaveExec = MRI.createVirtualRegister(RegClass: BoolXExecRC);
5317 Register TmpExec = MRI.createVirtualRegister(RegClass: BoolXExecRC);
5318 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
5319
5320 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::IMPLICIT_DEF), DestReg: TmpExec);
5321
5322 // Save the EXEC mask
5323 // clang-format off
5324 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: LMC.MovOpc), DestReg: SaveExec)
5325 .addReg(RegNo: LMC.ExecReg);
5326 // clang-format on
5327
5328 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, InstInLoop: false);
5329
5330 const MachineOperand *Idx = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::idx);
5331
5332 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, OrigBB&: MBB, LoopBB&: *LoopBB, DL, Idx: *Idx,
5333 InitReg: InitResultReg, ResultReg: DstReg, PhiReg, InitSaveExecReg: TmpExec,
5334 Offset, UseGPRIdxMode, SGPRIdxReg);
5335
5336 MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
5337 MachineFunction::iterator MBBI(LoopBB);
5338 ++MBBI;
5339 MF->insert(MBBI, MBB: LandingPad);
5340 LoopBB->removeSuccessor(Succ: RemainderBB);
5341 LandingPad->addSuccessor(Succ: RemainderBB);
5342 LoopBB->addSuccessor(Succ: LandingPad);
5343 MachineBasicBlock::iterator First = LandingPad->begin();
5344 // clang-format off
5345 BuildMI(BB&: *LandingPad, I: First, MIMD: DL, MCID: TII->get(Opcode: LMC.MovOpc), DestReg: LMC.ExecReg)
5346 .addReg(RegNo: SaveExec);
5347 // clang-format on
5348
5349 return InsPt;
5350}
5351
5352// Returns subreg index, offset
5353static std::pair<unsigned, int>
5354computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
5355 const TargetRegisterClass *SuperRC, unsigned VecReg,
5356 int Offset) {
5357 int NumElts = TRI.getRegSizeInBits(RC: *SuperRC) / 32;
5358
5359 // Skip out of bounds offsets, or else we would end up using an undefined
5360 // register.
5361 if (Offset >= NumElts || Offset < 0)
5362 return std::pair(AMDGPU::sub0, Offset);
5363
5364 return std::pair(SIRegisterInfo::getSubRegFromChannel(Channel: Offset), 0);
5365}
5366
5367static void setM0ToIndexFromSGPR(const SIInstrInfo *TII,
5368 MachineRegisterInfo &MRI, MachineInstr &MI,
5369 int Offset) {
5370 MachineBasicBlock *MBB = MI.getParent();
5371 const DebugLoc &DL = MI.getDebugLoc();
5372 MachineBasicBlock::iterator I(&MI);
5373
5374 const MachineOperand *Idx = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::idx);
5375
5376 assert(Idx->getReg() != AMDGPU::NoRegister);
5377
5378 if (Offset == 0) {
5379 // clang-format off
5380 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
5381 .add(MO: *Idx);
5382 // clang-format on
5383 } else {
5384 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: AMDGPU::M0)
5385 .add(MO: *Idx)
5386 .addImm(Val: Offset);
5387 }
5388}
5389
5390static Register getIndirectSGPRIdx(const SIInstrInfo *TII,
5391 MachineRegisterInfo &MRI, MachineInstr &MI,
5392 int Offset) {
5393 MachineBasicBlock *MBB = MI.getParent();
5394 const DebugLoc &DL = MI.getDebugLoc();
5395 MachineBasicBlock::iterator I(&MI);
5396
5397 const MachineOperand *Idx = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::idx);
5398
5399 if (Offset == 0)
5400 return Idx->getReg();
5401
5402 Register Tmp = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
5403 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: Tmp)
5404 .add(MO: *Idx)
5405 .addImm(Val: Offset);
5406 return Tmp;
5407}
5408
5409static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
5410 MachineBasicBlock &MBB,
5411 const GCNSubtarget &ST) {
5412 const SIInstrInfo *TII = ST.getInstrInfo();
5413 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5414 MachineFunction *MF = MBB.getParent();
5415 MachineRegisterInfo &MRI = MF->getRegInfo();
5416
5417 Register Dst = MI.getOperand(i: 0).getReg();
5418 const MachineOperand *Idx = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::idx);
5419 Register SrcReg = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src)->getReg();
5420 int Offset = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::offset)->getImm();
5421
5422 const TargetRegisterClass *VecRC = MRI.getRegClass(Reg: SrcReg);
5423 const TargetRegisterClass *IdxRC = MRI.getRegClass(Reg: Idx->getReg());
5424
5425 unsigned SubReg;
5426 std::tie(args&: SubReg, args&: Offset) =
5427 computeIndirectRegAndOffset(TRI, SuperRC: VecRC, VecReg: SrcReg, Offset);
5428
5429 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5430
5431 // Check for a SGPR index.
5432 if (TII->getRegisterInfo().isSGPRClass(RC: IdxRC)) {
5433 MachineBasicBlock::iterator I(&MI);
5434 const DebugLoc &DL = MI.getDebugLoc();
5435
5436 if (UseGPRIdxMode) {
5437 // TODO: Look at the uses to avoid the copy. This may require rescheduling
5438 // to avoid interfering with other uses, so probably requires a new
5439 // optimization pass.
5440 Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
5441
5442 const MCInstrDesc &GPRIDXDesc =
5443 TII->getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(RC: *VecRC), IsIndirectSrc: true);
5444 BuildMI(BB&: MBB, I, MIMD: DL, MCID: GPRIDXDesc, DestReg: Dst)
5445 .addReg(RegNo: SrcReg)
5446 .addReg(RegNo: Idx)
5447 .addImm(Val: SubReg);
5448 } else {
5449 setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
5450
5451 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MOVRELS_B32_e32), DestReg: Dst)
5452 .addReg(RegNo: SrcReg, Flags: {}, SubReg)
5453 .addReg(RegNo: SrcReg, Flags: RegState::Implicit);
5454 }
5455
5456 MI.eraseFromParent();
5457
5458 return &MBB;
5459 }
5460
5461 // Control flow needs to be inserted if indexing with a VGPR.
5462 const DebugLoc &DL = MI.getDebugLoc();
5463 MachineBasicBlock::iterator I(&MI);
5464
5465 Register PhiReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
5466 Register InitReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
5467
5468 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::IMPLICIT_DEF), DestReg: InitReg);
5469
5470 Register SGPRIdxReg;
5471 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitResultReg: InitReg, PhiReg, Offset,
5472 UseGPRIdxMode, SGPRIdxReg);
5473
5474 MachineBasicBlock *LoopBB = InsPt->getParent();
5475
5476 if (UseGPRIdxMode) {
5477 const MCInstrDesc &GPRIDXDesc =
5478 TII->getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(RC: *VecRC), IsIndirectSrc: true);
5479
5480 BuildMI(BB&: *LoopBB, I: InsPt, MIMD: DL, MCID: GPRIDXDesc, DestReg: Dst)
5481 .addReg(RegNo: SrcReg)
5482 .addReg(RegNo: SGPRIdxReg)
5483 .addImm(Val: SubReg);
5484 } else {
5485 BuildMI(BB&: *LoopBB, I: InsPt, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MOVRELS_B32_e32), DestReg: Dst)
5486 .addReg(RegNo: SrcReg, Flags: {}, SubReg)
5487 .addReg(RegNo: SrcReg, Flags: RegState::Implicit);
5488 }
5489
5490 MI.eraseFromParent();
5491
5492 return LoopBB;
5493}
5494
5495static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
5496 MachineBasicBlock &MBB,
5497 const GCNSubtarget &ST) {
5498 const SIInstrInfo *TII = ST.getInstrInfo();
5499 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5500 MachineFunction *MF = MBB.getParent();
5501 MachineRegisterInfo &MRI = MF->getRegInfo();
5502
5503 Register Dst = MI.getOperand(i: 0).getReg();
5504 const MachineOperand *SrcVec = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src);
5505 const MachineOperand *Idx = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::idx);
5506 const MachineOperand *Val = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::val);
5507 int Offset = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::offset)->getImm();
5508 const TargetRegisterClass *VecRC = MRI.getRegClass(Reg: SrcVec->getReg());
5509 const TargetRegisterClass *IdxRC = MRI.getRegClass(Reg: Idx->getReg());
5510
5511 // This can be an immediate, but will be folded later.
5512 assert(Val->getReg());
5513
5514 unsigned SubReg;
5515 std::tie(args&: SubReg, args&: Offset) =
5516 computeIndirectRegAndOffset(TRI, SuperRC: VecRC, VecReg: SrcVec->getReg(), Offset);
5517 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5518
5519 if (Idx->getReg() == AMDGPU::NoRegister) {
5520 MachineBasicBlock::iterator I(&MI);
5521 const DebugLoc &DL = MI.getDebugLoc();
5522
5523 assert(Offset == 0);
5524
5525 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::INSERT_SUBREG), DestReg: Dst)
5526 .add(MO: *SrcVec)
5527 .add(MO: *Val)
5528 .addImm(Val: SubReg);
5529
5530 MI.eraseFromParent();
5531 return &MBB;
5532 }
5533
5534 // Check for a SGPR index.
5535 if (TII->getRegisterInfo().isSGPRClass(RC: IdxRC)) {
5536 MachineBasicBlock::iterator I(&MI);
5537 const DebugLoc &DL = MI.getDebugLoc();
5538
5539 if (UseGPRIdxMode) {
5540 Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
5541
5542 const MCInstrDesc &GPRIDXDesc =
5543 TII->getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(RC: *VecRC), IsIndirectSrc: false);
5544 BuildMI(BB&: MBB, I, MIMD: DL, MCID: GPRIDXDesc, DestReg: Dst)
5545 .addReg(RegNo: SrcVec->getReg())
5546 .add(MO: *Val)
5547 .addReg(RegNo: Idx)
5548 .addImm(Val: SubReg);
5549 } else {
5550 setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
5551
5552 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5553 VecSize: TRI.getRegSizeInBits(RC: *VecRC), EltSize: 32, IsSGPR: false);
5554 BuildMI(BB&: MBB, I, MIMD: DL, MCID: MovRelDesc, DestReg: Dst)
5555 .addReg(RegNo: SrcVec->getReg())
5556 .add(MO: *Val)
5557 .addImm(Val: SubReg);
5558 }
5559 MI.eraseFromParent();
5560 return &MBB;
5561 }
5562
5563 // Control flow needs to be inserted if indexing with a VGPR.
5564 if (Val->isReg())
5565 MRI.clearKillFlags(Reg: Val->getReg());
5566
5567 const DebugLoc &DL = MI.getDebugLoc();
5568
5569 Register PhiReg = MRI.createVirtualRegister(RegClass: VecRC);
5570
5571 Register SGPRIdxReg;
5572 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitResultReg: SrcVec->getReg(), PhiReg, Offset,
5573 UseGPRIdxMode, SGPRIdxReg);
5574 MachineBasicBlock *LoopBB = InsPt->getParent();
5575
5576 if (UseGPRIdxMode) {
5577 const MCInstrDesc &GPRIDXDesc =
5578 TII->getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(RC: *VecRC), IsIndirectSrc: false);
5579
5580 BuildMI(BB&: *LoopBB, I: InsPt, MIMD: DL, MCID: GPRIDXDesc, DestReg: Dst)
5581 .addReg(RegNo: PhiReg)
5582 .add(MO: *Val)
5583 .addReg(RegNo: SGPRIdxReg)
5584 .addImm(Val: SubReg);
5585 } else {
5586 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5587 VecSize: TRI.getRegSizeInBits(RC: *VecRC), EltSize: 32, IsSGPR: false);
5588 BuildMI(BB&: *LoopBB, I: InsPt, MIMD: DL, MCID: MovRelDesc, DestReg: Dst)
5589 .addReg(RegNo: PhiReg)
5590 .add(MO: *Val)
5591 .addImm(Val: SubReg);
5592 }
5593
5594 MI.eraseFromParent();
5595 return LoopBB;
5596}
5597
5598static MachineBasicBlock *expand64BitScalarArithmetic(MachineInstr &MI,
5599 MachineBasicBlock *BB) {
5600 // For targets older than GFX12, we emit a sequence of 32-bit operations.
5601 // For GFX12, we emit s_add_u64 and s_sub_u64.
5602 MachineFunction *MF = BB->getParent();
5603 const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
5604 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5605 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5606 const DebugLoc &DL = MI.getDebugLoc();
5607 MachineOperand &Dest = MI.getOperand(i: 0);
5608 MachineOperand &Src0 = MI.getOperand(i: 1);
5609 MachineOperand &Src1 = MI.getOperand(i: 2);
5610 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5611 if (ST.hasScalarAddSub64()) {
5612 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5613 // clang-format off
5614 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: Dest.getReg())
5615 .add(MO: Src0)
5616 .add(MO: Src1);
5617 // clang-format on
5618 } else {
5619 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5620 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5621
5622 Register DestSub0 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5623 Register DestSub1 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5624
5625 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5626 MI, MRI, SuperReg: Src0, SuperRC: BoolRC, SubIdx: AMDGPU::sub0, SubRC: &AMDGPU::SReg_32RegClass);
5627 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5628 MI, MRI, SuperReg: Src0, SuperRC: BoolRC, SubIdx: AMDGPU::sub1, SubRC: &AMDGPU::SReg_32RegClass);
5629
5630 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5631 MI, MRI, SuperReg: Src1, SuperRC: BoolRC, SubIdx: AMDGPU::sub0, SubRC: &AMDGPU::SReg_32RegClass);
5632 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5633 MI, MRI, SuperReg: Src1, SuperRC: BoolRC, SubIdx: AMDGPU::sub1, SubRC: &AMDGPU::SReg_32RegClass);
5634
5635 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5636 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5637 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: LoOpc), DestReg: DestSub0).add(MO: Src0Sub0).add(MO: Src1Sub0);
5638 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: HiOpc), DestReg: DestSub1).add(MO: Src0Sub1).add(MO: Src1Sub1);
5639 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: Dest.getReg())
5640 .addReg(RegNo: DestSub0)
5641 .addImm(Val: AMDGPU::sub0)
5642 .addReg(RegNo: DestSub1)
5643 .addImm(Val: AMDGPU::sub1);
5644 }
5645 MI.eraseFromParent();
5646 return BB;
5647}
5648
5649static void expand64BitV_CNDMASK(MachineInstr &MI, MachineBasicBlock *BB) {
5650 MachineFunction *MF = BB->getParent();
5651 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5652 const SIInstrInfo *TII = ST.getInstrInfo();
5653 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5654 MachineRegisterInfo &MRI = MF->getRegInfo();
5655 const DebugLoc &DL = MI.getDebugLoc();
5656 Register Dst = MI.getOperand(i: 0).getReg();
5657 const MachineOperand &Src0 = MI.getOperand(i: 1);
5658 const MachineOperand &Src1 = MI.getOperand(i: 2);
5659 Register SrcCond = MI.getOperand(i: 3).getReg();
5660
5661 Register DstLo = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
5662 Register DstHi = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
5663 const TargetRegisterClass *CondRC = TRI->getWaveMaskRegClass();
5664 Register SrcCondCopy = MRI.createVirtualRegister(RegClass: CondRC);
5665
5666 int Src0Idx =
5667 AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::src0);
5668 int Src1Idx =
5669 AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::src1);
5670 const TargetRegisterClass *Src0RC =
5671 TRI->getAllocatableClass(RC: TII->getRegClass(MCID: MI.getDesc(), OpNum: Src0Idx));
5672 const TargetRegisterClass *Src1RC =
5673 TRI->getAllocatableClass(RC: TII->getRegClass(MCID: MI.getDesc(), OpNum: Src1Idx));
5674
5675 const TargetRegisterClass *Src0SubRC =
5676 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5677 const TargetRegisterClass *Src1SubRC =
5678 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5679
5680 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5681 MI, MRI, SuperReg: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub0, SubRC: Src0SubRC);
5682 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5683 MI, MRI, SuperReg: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub0, SubRC: Src1SubRC);
5684
5685 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5686 MI, MRI, SuperReg: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub1, SubRC: Src0SubRC);
5687 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5688 MI, MRI, SuperReg: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub1, SubRC: Src1SubRC);
5689
5690 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: SrcCondCopy).addReg(RegNo: SrcCond);
5691 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstLo)
5692 .addImm(Val: 0)
5693 .add(MO: Src0Sub0)
5694 .addImm(Val: 0)
5695 .add(MO: Src1Sub0)
5696 .addReg(RegNo: SrcCondCopy);
5697
5698 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstHi)
5699 .addImm(Val: 0)
5700 .add(MO: Src0Sub1)
5701 .addImm(Val: 0)
5702 .add(MO: Src1Sub1)
5703 .addReg(RegNo: SrcCondCopy);
5704
5705 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: Dst)
5706 .addReg(RegNo: DstLo)
5707 .addImm(Val: AMDGPU::sub0)
5708 .addReg(RegNo: DstHi)
5709 .addImm(Val: AMDGPU::sub1);
5710 MI.eraseFromParent();
5711}
5712
5713static uint64_t getIdentityValueForWaveReduction(unsigned Opc) {
5714 switch (Opc) {
5715 case AMDGPU::S_MIN_U32:
5716 return std::numeric_limits<uint32_t>::max();
5717 case AMDGPU::S_MIN_I32:
5718 return std::numeric_limits<int32_t>::max();
5719 case AMDGPU::S_MAX_U32:
5720 return std::numeric_limits<uint32_t>::min();
5721 case AMDGPU::S_MAX_I32:
5722 return std::numeric_limits<int32_t>::min();
5723 case AMDGPU::V_ADD_F32_e64: // -0.0
5724 return 0x80000000;
5725 case AMDGPU::V_SUB_F32_e64: // +0.0
5726 return 0x0;
5727 case AMDGPU::S_ADD_I32:
5728 case AMDGPU::S_SUB_I32:
5729 case AMDGPU::S_OR_B32:
5730 case AMDGPU::S_XOR_B32:
5731 return std::numeric_limits<uint32_t>::min();
5732 case AMDGPU::S_AND_B32:
5733 return std::numeric_limits<uint32_t>::max();
5734 case AMDGPU::V_MIN_F32_e64:
5735 case AMDGPU::V_MAX_F32_e64:
5736 return 0x7fc00000; // qNAN
5737 case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
5738 return std::numeric_limits<uint64_t>::max();
5739 case AMDGPU::V_CMP_LT_I64_e64: // min.i64
5740 return std::numeric_limits<int64_t>::max();
5741 case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
5742 return std::numeric_limits<uint64_t>::min();
5743 case AMDGPU::V_CMP_GT_I64_e64: // max.i64
5744 return std::numeric_limits<int64_t>::min();
5745 case AMDGPU::V_MIN_F64_e64:
5746 case AMDGPU::V_MAX_F64_e64:
5747 case AMDGPU::V_MIN_NUM_F64_e64:
5748 case AMDGPU::V_MAX_NUM_F64_e64:
5749 return 0x7FF8000000000000; // qNAN
5750 case AMDGPU::S_ADD_U64_PSEUDO:
5751 case AMDGPU::S_SUB_U64_PSEUDO:
5752 case AMDGPU::S_OR_B64:
5753 case AMDGPU::S_XOR_B64:
5754 return std::numeric_limits<uint64_t>::min();
5755 case AMDGPU::S_AND_B64:
5756 return std::numeric_limits<uint64_t>::max();
5757 case AMDGPU::V_ADD_F64_e64:
5758 case AMDGPU::V_ADD_F64_pseudo_e64:
5759 return 0x8000000000000000; // -0.0
5760 default:
5761 llvm_unreachable("Unexpected opcode in getIdentityValueForWaveReduction");
5762 }
5763}
5764
5765static bool is32bitWaveReduceOperation(unsigned Opc) {
5766 return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
5767 Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
5768 Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
5769 Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
5770 Opc == AMDGPU::S_XOR_B32 || Opc == AMDGPU::V_MIN_F32_e64 ||
5771 Opc == AMDGPU::V_MAX_F32_e64 || Opc == AMDGPU::V_ADD_F32_e64 ||
5772 Opc == AMDGPU::V_SUB_F32_e64;
5773}
5774
5775static bool isFloatingPointWaveReduceOperation(unsigned Opc) {
5776 return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64 ||
5777 Opc == AMDGPU::V_ADD_F32_e64 || Opc == AMDGPU::V_SUB_F32_e64 ||
5778 Opc == AMDGPU::V_MIN_F64_e64 || Opc == AMDGPU::V_MAX_F64_e64 ||
5779 Opc == AMDGPU::V_MIN_NUM_F64_e64 || Opc == AMDGPU::V_MAX_NUM_F64_e64 ||
5780 Opc == AMDGPU::V_ADD_F64_e64 || Opc == AMDGPU::V_ADD_F64_pseudo_e64;
5781}
5782
5783static std::tuple<unsigned, unsigned>
5784getDPPOpcForWaveReduction(unsigned Opc, const GCNSubtarget &ST) {
5785 unsigned DPPOpc;
5786 switch (Opc) {
5787 case AMDGPU::S_MIN_U32:
5788 DPPOpc = AMDGPU::V_MIN_U32_dpp;
5789 break;
5790 case AMDGPU::S_MIN_I32:
5791 DPPOpc = AMDGPU::V_MIN_I32_dpp;
5792 break;
5793 case AMDGPU::S_MAX_U32:
5794 DPPOpc = AMDGPU::V_MAX_U32_dpp;
5795 break;
5796 case AMDGPU::S_MAX_I32:
5797 DPPOpc = AMDGPU::V_MAX_I32_dpp;
5798 break;
5799 case AMDGPU::S_ADD_I32:
5800 case AMDGPU::S_SUB_I32:
5801 DPPOpc = ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_dpp
5802 : AMDGPU::V_ADD_CO_U32_dpp;
5803 break;
5804 case AMDGPU::S_AND_B32:
5805 DPPOpc = AMDGPU::V_AND_B32_dpp;
5806 break;
5807 case AMDGPU::S_OR_B32:
5808 DPPOpc = AMDGPU::V_OR_B32_dpp;
5809 break;
5810 case AMDGPU::S_XOR_B32:
5811 DPPOpc = AMDGPU::V_XOR_B32_dpp;
5812 break;
5813 case AMDGPU::V_ADD_F32_e64:
5814 case AMDGPU::V_SUB_F32_e64:
5815 DPPOpc = AMDGPU::V_ADD_F32_dpp;
5816 break;
5817 case AMDGPU::V_MIN_F32_e64:
5818 DPPOpc = AMDGPU::V_MIN_F32_dpp;
5819 break;
5820 case AMDGPU::V_MAX_F32_e64:
5821 DPPOpc = AMDGPU::V_MAX_F32_dpp;
5822 break;
5823 case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
5824 case AMDGPU::V_CMP_LT_I64_e64: // min.i64
5825 case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
5826 case AMDGPU::V_CMP_GT_I64_e64: // max.i64
5827 case AMDGPU::S_ADD_U64_PSEUDO:
5828 case AMDGPU::S_SUB_U64_PSEUDO:
5829 case AMDGPU::S_AND_B64:
5830 case AMDGPU::S_OR_B64:
5831 case AMDGPU::S_XOR_B64:
5832 case AMDGPU::V_MIN_NUM_F64_e64:
5833 case AMDGPU::V_MIN_F64_e64:
5834 case AMDGPU::V_MAX_NUM_F64_e64:
5835 case AMDGPU::V_MAX_F64_e64:
5836 case AMDGPU::V_ADD_F64_pseudo_e64:
5837 case AMDGPU::V_ADD_F64_e64:
5838 DPPOpc = AMDGPU::V_MOV_B64_DPP_PSEUDO;
5839 break;
5840 default:
5841 llvm_unreachable("unhandled lane op");
5842 }
5843 unsigned ClampOpc = Opc;
5844 if (!ST.getInstrInfo()->isVALU(Opcode: Opc, /*AllowLDSDMA=*/true)) {
5845 if (Opc == AMDGPU::S_SUB_I32)
5846 ClampOpc = AMDGPU::S_ADD_I32;
5847 if (Opc == AMDGPU::S_ADD_U64_PSEUDO || Opc == AMDGPU::S_SUB_U64_PSEUDO)
5848 ClampOpc = AMDGPU::V_ADD_CO_U32_e64;
5849 else if (Opc == AMDGPU::S_AND_B64)
5850 ClampOpc = AMDGPU::V_AND_B32_e64;
5851 else if (Opc == AMDGPU::S_OR_B64)
5852 ClampOpc = AMDGPU::V_OR_B32_e64;
5853 else if (Opc == AMDGPU::S_XOR_B64)
5854 ClampOpc = AMDGPU::V_XOR_B32_e64;
5855 else
5856 ClampOpc = ST.getInstrInfo()->getVALUOp(Opc: ClampOpc);
5857 }
5858 return {DPPOpc, ClampOpc};
5859}
5860
5861static std::pair<Register, Register>
5862ExtractSubRegs(MachineInstr &MI, MachineOperand &Op,
5863 const TargetRegisterClass *SrcRC, const GCNSubtarget &ST,
5864 MachineRegisterInfo &MRI) {
5865 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5866 const SIInstrInfo *TII = ST.getInstrInfo();
5867 const TargetRegisterClass *SrcSubRC =
5868 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5869 Register Op1L =
5870 TII->buildExtractSubReg(MI, MRI, SuperReg: Op, SuperRC: SrcRC, SubIdx: AMDGPU::sub0, SubRC: SrcSubRC);
5871 Register Op1H =
5872 TII->buildExtractSubReg(MI, MRI, SuperReg: Op, SuperRC: SrcRC, SubIdx: AMDGPU::sub1, SubRC: SrcSubRC);
5873 return {Op1L, Op1H};
5874}
5875
5876static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
5877 MachineBasicBlock &BB,
5878 const GCNSubtarget &ST,
5879 unsigned Opc) {
5880 MachineRegisterInfo &MRI = BB.getParent()->getRegInfo();
5881 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5882 const DebugLoc &DL = MI.getDebugLoc();
5883 const SIInstrInfo *TII = ST.getInstrInfo();
5884
5885 // Reduction operations depend on whether the input operand is SGPR or VGPR.
5886 Register SrcReg = MI.getOperand(i: 1).getReg();
5887 bool isSGPR = TRI->isSGPRClass(RC: MRI.getRegClass(Reg: SrcReg));
5888 Register DstReg = MI.getOperand(i: 0).getReg();
5889 unsigned Stratergy = static_cast<unsigned>(MI.getOperand(i: 2).getImm());
5890 enum WAVE_REDUCE_STRATEGY : unsigned { DEFAULT = 0, ITERATIVE = 1, DPP = 2 };
5891 MachineBasicBlock *RetBB = nullptr;
5892 unsigned MIOpc = MI.getOpcode();
5893 auto BuildRegSequence = [&](MachineBasicBlock &BB,
5894 MachineBasicBlock::iterator MI, Register Dst,
5895 Register Src0, Register Src1) {
5896 auto RegSequence =
5897 BuildMI(BB, I: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: Dst)
5898 .addReg(RegNo: Src0)
5899 .addImm(Val: AMDGPU::sub0)
5900 .addReg(RegNo: Src1)
5901 .addImm(Val: AMDGPU::sub1);
5902 return RegSequence;
5903 };
5904 if (isSGPR) {
5905 switch (Opc) {
5906 case AMDGPU::S_MIN_U32:
5907 case AMDGPU::S_MIN_I32:
5908 case AMDGPU::V_MIN_F32_e64:
5909 case AMDGPU::S_MAX_U32:
5910 case AMDGPU::S_MAX_I32:
5911 case AMDGPU::V_MAX_F32_e64:
5912 case AMDGPU::S_AND_B32:
5913 case AMDGPU::S_OR_B32: {
5914 // Idempotent operations.
5915 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: DstReg).addReg(RegNo: SrcReg);
5916 RetBB = &BB;
5917 break;
5918 }
5919 case AMDGPU::V_CMP_LT_U64_e64: // umin
5920 case AMDGPU::V_CMP_LT_I64_e64: // min
5921 case AMDGPU::V_CMP_GT_U64_e64: // umax
5922 case AMDGPU::V_CMP_GT_I64_e64: // max
5923 case AMDGPU::V_MIN_F64_e64:
5924 case AMDGPU::V_MIN_NUM_F64_e64:
5925 case AMDGPU::V_MAX_F64_e64:
5926 case AMDGPU::V_MAX_NUM_F64_e64:
5927 case AMDGPU::S_AND_B64:
5928 case AMDGPU::S_OR_B64: {
5929 // Idempotent operations.
5930 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B64), DestReg: DstReg).addReg(RegNo: SrcReg);
5931 RetBB = &BB;
5932 break;
5933 }
5934 case AMDGPU::S_XOR_B32:
5935 case AMDGPU::S_XOR_B64:
5936 case AMDGPU::S_ADD_I32:
5937 case AMDGPU::S_ADD_U64_PSEUDO:
5938 case AMDGPU::V_ADD_F32_e64:
5939 case AMDGPU::V_ADD_F64_e64:
5940 case AMDGPU::V_ADD_F64_pseudo_e64:
5941 case AMDGPU::S_SUB_I32:
5942 case AMDGPU::S_SUB_U64_PSEUDO:
5943 case AMDGPU::V_SUB_F32_e64: {
5944 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5945 const TargetRegisterClass *DstRegClass = MRI.getRegClass(Reg: DstReg);
5946 Register ExecMask = MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
5947 Register NumActiveLanes =
5948 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5949
5950 bool IsWave32 = ST.isWave32();
5951 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5952 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5953 unsigned BitCountOpc =
5954 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5955
5956 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: MovOpc), DestReg: ExecMask).addReg(RegNo: ExecReg);
5957
5958 auto NewAccumulator =
5959 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: BitCountOpc), DestReg: NumActiveLanes)
5960 .addReg(RegNo: ExecMask);
5961
5962 switch (Opc) {
5963 case AMDGPU::S_XOR_B32:
5964 case AMDGPU::S_XOR_B64: {
5965 // Performing an XOR operation on a uniform value
5966 // depends on the parity of the number of active lanes.
5967 // For even parity, the result will be 0, for odd
5968 // parity the result will be the same as the input value.
5969 Register ParityRegister =
5970 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5971
5972 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_AND_B32), DestReg: ParityRegister)
5973 .addReg(RegNo: NewAccumulator->getOperand(i: 0).getReg())
5974 .addImm(Val: 1)
5975 .setOperandDead(3); // Dead scc
5976 if (Opc == AMDGPU::S_XOR_B32) {
5977 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: DstReg)
5978 .addReg(RegNo: SrcReg)
5979 .addReg(RegNo: ParityRegister);
5980 } else {
5981 Register DestSub0 =
5982 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5983 Register DestSub1 =
5984 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5985 auto [Op1L, Op1H] = ExtractSubRegs(MI, Op&: MI.getOperand(i: 1),
5986 SrcRC: MRI.getRegClass(Reg: SrcReg), ST, MRI);
5987 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: DestSub0)
5988 .addReg(RegNo: Op1L)
5989 .addReg(RegNo: ParityRegister);
5990 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: DestSub1)
5991 .addReg(RegNo: Op1H)
5992 .addReg(RegNo: ParityRegister);
5993 BuildRegSequence(BB, MI, DstReg, DestSub0, DestSub1);
5994 }
5995 break;
5996 }
5997 case AMDGPU::S_SUB_I32: {
5998 Register NegatedVal = MRI.createVirtualRegister(RegClass: DstRegClass);
5999
6000 // Take the negation of the source operand.
6001 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_SUB_I32), DestReg: NegatedVal)
6002 .addImm(Val: 0)
6003 .addReg(RegNo: SrcReg);
6004 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: DstReg)
6005 .addReg(RegNo: NegatedVal)
6006 .addReg(RegNo: NewAccumulator->getOperand(i: 0).getReg());
6007 break;
6008 }
6009 case AMDGPU::S_ADD_I32: {
6010 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: DstReg)
6011 .addReg(RegNo: SrcReg)
6012 .addReg(RegNo: NewAccumulator->getOperand(i: 0).getReg());
6013 break;
6014 }
6015 case AMDGPU::S_ADD_U64_PSEUDO:
6016 case AMDGPU::S_SUB_U64_PSEUDO: {
6017 Register DestSub0 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
6018 Register DestSub1 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
6019 Register Op1H_Op0L_Reg =
6020 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
6021 Register Op1L_Op0H_Reg =
6022 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
6023 Register CarryReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
6024 Register AddReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
6025 Register NegatedValLo =
6026 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
6027 Register NegatedValHi =
6028 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
6029 auto [Op1L, Op1H] = ExtractSubRegs(MI, Op&: MI.getOperand(i: 1),
6030 SrcRC: MRI.getRegClass(Reg: SrcReg), ST, MRI);
6031 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
6032 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_SUB_I32), DestReg: NegatedValLo)
6033 .addImm(Val: 0)
6034 .addReg(RegNo: NewAccumulator->getOperand(i: 0).getReg())
6035 .setOperandDead(3); // Dead scc
6036 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ASHR_I32), DestReg: NegatedValHi)
6037 .addReg(RegNo: NegatedValLo)
6038 .addImm(Val: 31)
6039 .setOperandDead(3); // Dead scc
6040 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: Op1L_Op0H_Reg)
6041 .addReg(RegNo: Op1L)
6042 .addReg(RegNo: NegatedValHi);
6043 }
6044 Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
6045 ? NegatedValLo
6046 : NewAccumulator->getOperand(i: 0).getReg();
6047 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: DestSub0)
6048 .addReg(RegNo: Op1L)
6049 .addReg(RegNo: LowOpcode);
6050 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_HI_U32), DestReg: CarryReg)
6051 .addReg(RegNo: Op1L)
6052 .addReg(RegNo: LowOpcode);
6053 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: Op1H_Op0L_Reg)
6054 .addReg(RegNo: Op1H)
6055 .addReg(RegNo: LowOpcode);
6056
6057 Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
6058 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_U32), DestReg: HiVal)
6059 .addReg(RegNo: CarryReg)
6060 .addReg(RegNo: Op1H_Op0L_Reg)
6061 .setOperandDead(3); // Dead scc
6062
6063 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
6064 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_U32), DestReg: DestSub1)
6065 .addReg(RegNo: HiVal)
6066 .addReg(RegNo: Op1L_Op0H_Reg)
6067 .setOperandDead(3); // Dead scc
6068 }
6069 BuildRegSequence(BB, MI, DstReg, DestSub0, DestSub1);
6070 break;
6071 }
6072 case AMDGPU::V_ADD_F32_e64:
6073 case AMDGPU::V_ADD_F64_e64:
6074 case AMDGPU::V_ADD_F64_pseudo_e64:
6075 case AMDGPU::V_SUB_F32_e64: {
6076 bool is32BitOpc = is32bitWaveReduceOperation(Opc);
6077 const TargetRegisterClass *VregRC = TII->getRegClass(MCID: TII->get(Opcode: Opc), OpNum: 0);
6078 Register ActiveLanesVreg = MRI.createVirtualRegister(RegClass: VregRC);
6079 Register DstVreg = MRI.createVirtualRegister(RegClass: VregRC);
6080 // Get number of active lanes as a float val.
6081 BuildMI(BB, I&: MI, MIMD: DL,
6082 MCID: TII->get(Opcode: is32BitOpc ? AMDGPU::V_CVT_F32_I32_e64
6083 : AMDGPU::V_CVT_F64_I32_e64),
6084 DestReg: ActiveLanesVreg)
6085 .addReg(RegNo: NewAccumulator->getOperand(i: 0).getReg())
6086 .addImm(Val: 0) // clamp
6087 .addImm(Val: 0); // output-modifier
6088
6089 // Take negation of input for SUB reduction
6090 unsigned srcMod = (MIOpc == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32 ||
6091 MIOpc == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64)
6092 ? SISrcMods::NEG
6093 : SISrcMods::NONE;
6094 unsigned MulOpc = is32BitOpc ? AMDGPU::V_MUL_F32_e64
6095 : ST.getGeneration() >= AMDGPUSubtarget::GFX12
6096 ? AMDGPU::V_MUL_F64_pseudo_e64
6097 : AMDGPU::V_MUL_F64_e64;
6098 auto DestVregInst = BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: MulOpc),
6099 DestReg: DstVreg)
6100 .addImm(Val: srcMod) // src0 modifier
6101 .addReg(RegNo: SrcReg)
6102 .addImm(Val: SISrcMods::NONE) // src1 modifier
6103 .addReg(RegNo: ActiveLanesVreg)
6104 .addImm(Val: SISrcMods::NONE) // clamp
6105 .addImm(Val: SISrcMods::NONE); // output-mod
6106 if (is32BitOpc) {
6107 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: DstReg)
6108 .addReg(RegNo: DstVreg);
6109 } else {
6110 Register LaneValueLoReg =
6111 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6112 Register LaneValueHiReg =
6113 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6114 auto [Op1L, Op1H] =
6115 ExtractSubRegs(MI, Op&: DestVregInst->getOperand(i: 0), SrcRC: VregRC, ST, MRI);
6116 // lane value input should be in an sgpr
6117 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32),
6118 DestReg: LaneValueLoReg)
6119 .addReg(RegNo: Op1L);
6120 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32),
6121 DestReg: LaneValueHiReg)
6122 .addReg(RegNo: Op1H);
6123 NewAccumulator =
6124 BuildRegSequence(BB, MI, DstReg, LaneValueLoReg, LaneValueHiReg);
6125 }
6126 }
6127 }
6128 RetBB = &BB;
6129 }
6130 }
6131 } else {
6132 MachineBasicBlock::iterator I = BB.end();
6133 Register SrcReg = MI.getOperand(i: 1).getReg();
6134 bool is32BitOpc = is32bitWaveReduceOperation(Opc);
6135 bool isFPOp = isFloatingPointWaveReduceOperation(Opc);
6136 bool NeedsMovDPP = !is32BitOpc;
6137 // Create virtual registers required for lowering.
6138 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
6139 const TargetRegisterClass *DstRegClass = MRI.getRegClass(Reg: DstReg);
6140 const TargetRegisterClass *SrcRegClass = MRI.getRegClass(Reg: SrcReg);
6141 bool IsWave32 = ST.isWave32();
6142 unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
6143 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6144 if (Stratergy == WAVE_REDUCE_STRATEGY::ITERATIVE ||
6145 !ST.hasDPP()) { // If target doesn't support DPP operations, default to
6146 // iterative stratergy
6147
6148 // To reduce the VGPR using iterative approach, we need to iterate
6149 // over all the active lanes. Lowering consists of ComputeLoop,
6150 // which iterate over only active lanes. We use copy of EXEC register
6151 // as induction variable and every active lane modifies it using bitset0
6152 // so that we will get the next active lane for next iteration.
6153
6154 // Create Control flow for loop
6155 // Split MI's Machine Basic block into For loop
6156 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, MBB&: BB, InstInLoop: true);
6157
6158 Register LoopIterator = MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
6159 Register IdentityValReg = MRI.createVirtualRegister(RegClass: DstRegClass);
6160 Register AccumulatorReg = MRI.createVirtualRegister(RegClass: DstRegClass);
6161 Register ActiveBitsReg = MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
6162 Register NewActiveBitsReg = MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
6163 Register FF1Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
6164 Register LaneValueReg = MRI.createVirtualRegister(RegClass: DstRegClass);
6165
6166 // Create initial values of induction variable from Exec, Accumulator and
6167 // insert branch instr to newly created ComputeBlock
6168 BuildMI(BB, I, MIMD: DL, MCID: TII->get(Opcode: MovOpcForExec), DestReg: LoopIterator).addReg(RegNo: ExecReg);
6169 uint64_t IdentityValue =
6170 MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
6171 ? 0x0 // +0.0 for double sub reduction
6172 : getIdentityValueForWaveReduction(Opc);
6173 BuildMI(BB, I, MIMD: DL,
6174 MCID: TII->get(Opcode: is32BitOpc ? AMDGPU::S_MOV_B32
6175 : AMDGPU::S_MOV_B64_IMM_PSEUDO),
6176 DestReg: IdentityValReg)
6177 .addImm(Val: IdentityValue);
6178 // clang-format off
6179 BuildMI(BB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_BRANCH))
6180 .addMBB(MBB: ComputeLoop);
6181 // clang-format on
6182
6183 // Start constructing ComputeLoop
6184 I = ComputeLoop->begin();
6185 auto Accumulator =
6186 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::PHI), DestReg: AccumulatorReg)
6187 .addReg(RegNo: IdentityValReg)
6188 .addMBB(MBB: &BB);
6189 auto ActiveBits =
6190 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::PHI), DestReg: ActiveBitsReg)
6191 .addReg(RegNo: LoopIterator)
6192 .addMBB(MBB: &BB);
6193
6194 I = ComputeLoop->end();
6195 MachineInstr *NewAccumulator;
6196 // Perform the computations
6197 unsigned SFFOpc =
6198 IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
6199 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: SFFOpc), DestReg: FF1Reg)
6200 .addReg(RegNo: ActiveBitsReg);
6201 if (is32BitOpc) {
6202 Register OpDstReg = DstReg;
6203 bool hasSrc0Modifier = AMDGPU::getNamedOperandIdx(
6204 Opcode: Opc, Name: AMDGPU::OpName::src0_modifiers) != -1;
6205 bool hasSrc1Modifier = AMDGPU::getNamedOperandIdx(
6206 Opcode: Opc, Name: AMDGPU::OpName::src1_modifiers) != -1;
6207 bool hasClamp =
6208 AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::clamp) != -1;
6209 bool hasOpSel =
6210 AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::op_sel) != -1;
6211 bool hasOMod =
6212 AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::omod) != -1;
6213 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READLANE_B32),
6214 DestReg: LaneValueReg)
6215 .addReg(RegNo: SrcReg)
6216 .addReg(RegNo: FF1Reg);
6217 if (ST.getInstrInfo()->isVALU(Opcode: Opc, /*AllowLDSDMA=*/true)) {
6218 // Get the Lane Value in VGPR to avoid the Constant Bus Restriction
6219 Register LaneValVgpr = MRI.createVirtualRegister(RegClass: SrcRegClass);
6220 Register VgprResultReg = MRI.createVirtualRegister(RegClass: SrcRegClass);
6221 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: LaneValVgpr)
6222 .addReg(RegNo: LaneValueReg);
6223 OpDstReg = VgprResultReg;
6224 LaneValueReg = LaneValVgpr;
6225 }
6226 auto OpInstr = BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: OpDstReg);
6227 if (hasSrc0Modifier)
6228 OpInstr.addImm(Val: SISrcMods::NONE); // src0 modifier
6229 OpInstr.addReg(RegNo: AccumulatorReg); // src0
6230 if (hasSrc1Modifier)
6231 OpInstr.addImm(Val: SISrcMods::NONE); // src1 modifier
6232 OpInstr.addReg(RegNo: LaneValueReg); // src1
6233 if (hasClamp)
6234 OpInstr.addImm(Val: 0); // clamp
6235 if (hasOpSel)
6236 OpInstr.addImm(Val: 0); // opsel
6237 if (hasOMod)
6238 OpInstr.addImm(Val: 0); // omod
6239 if (ST.getInstrInfo()->isVALU(Opcode: Opc, /*AllowLDSDMA=*/true)) {
6240 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32),
6241 DestReg: DstReg)
6242 .addReg(RegNo: OpDstReg);
6243 }
6244 } else {
6245 Register LaneValueLoReg =
6246 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6247 Register LaneValueHiReg =
6248 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6249 Register LaneValReg =
6250 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_64RegClass);
6251 auto [Op1L, Op1H] = ExtractSubRegs(MI, Op&: MI.getOperand(i: 1),
6252 SrcRC: MRI.getRegClass(Reg: SrcReg), ST, MRI);
6253 // lane value input should be in an sgpr
6254 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READLANE_B32),
6255 DestReg: LaneValueLoReg)
6256 .addReg(RegNo: Op1L)
6257 .addReg(RegNo: FF1Reg);
6258 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READLANE_B32),
6259 DestReg: LaneValueHiReg)
6260 .addReg(RegNo: Op1H)
6261 .addReg(RegNo: FF1Reg);
6262 auto LaneValue = BuildRegSequence(*ComputeLoop, I, LaneValReg,
6263 LaneValueLoReg, LaneValueHiReg);
6264 switch (Opc) {
6265 case AMDGPU::S_OR_B64:
6266 case AMDGPU::S_AND_B64:
6267 case AMDGPU::S_XOR_B64: {
6268 NewAccumulator = BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: DstReg)
6269 .addReg(RegNo: Accumulator->getOperand(i: 0).getReg())
6270 .addReg(RegNo: LaneValue->getOperand(i: 0).getReg())
6271 .setOperandDead(3); // Dead scc
6272 break;
6273 }
6274 case AMDGPU::V_CMP_GT_I64_e64:
6275 case AMDGPU::V_CMP_GT_U64_e64:
6276 case AMDGPU::V_CMP_LT_I64_e64:
6277 case AMDGPU::V_CMP_LT_U64_e64: {
6278 Register LaneMaskReg = MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
6279 Register ComparisonResultReg =
6280 MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
6281 int SrcIdx =
6282 AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::src);
6283 const TargetRegisterClass *VregClass =
6284 TRI->getAllocatableClass(RC: TII->getRegClass(MCID: MI.getDesc(), OpNum: SrcIdx));
6285 Register AccumulatorVReg = MRI.createVirtualRegister(RegClass: VregClass);
6286 auto [SrcReg0Sub0, SrcReg0Sub1] = ExtractSubRegs(
6287 MI, Op&: Accumulator->getOperand(i: 0), SrcRC: VregClass, ST, MRI);
6288 BuildRegSequence(*ComputeLoop, I, AccumulatorVReg, SrcReg0Sub0,
6289 SrcReg0Sub1);
6290 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: LaneMaskReg)
6291 .addReg(RegNo: LaneValue->getOperand(i: 0).getReg())
6292 .addReg(RegNo: AccumulatorVReg);
6293
6294 unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
6295 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AndOpc), DestReg: ComparisonResultReg)
6296 .addReg(RegNo: LaneMaskReg)
6297 .addReg(RegNo: ActiveBitsReg);
6298
6299 NewAccumulator = BuildMI(BB&: *ComputeLoop, I, MIMD: DL,
6300 MCID: TII->get(Opcode: AMDGPU::S_CSELECT_B64), DestReg: DstReg)
6301 .addReg(RegNo: LaneValue->getOperand(i: 0).getReg())
6302 .addReg(RegNo: Accumulator->getOperand(i: 0).getReg());
6303 break;
6304 }
6305 case AMDGPU::V_MIN_F64_e64:
6306 case AMDGPU::V_MIN_NUM_F64_e64:
6307 case AMDGPU::V_MAX_F64_e64:
6308 case AMDGPU::V_MAX_NUM_F64_e64:
6309 case AMDGPU::V_ADD_F64_e64:
6310 case AMDGPU::V_ADD_F64_pseudo_e64: {
6311 int SrcIdx =
6312 AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::src);
6313 const TargetRegisterClass *VregRC =
6314 TRI->getAllocatableClass(RC: TII->getRegClass(MCID: MI.getDesc(), OpNum: SrcIdx));
6315 Register AccumulatorVReg = MRI.createVirtualRegister(RegClass: VregRC);
6316 Register DstVreg = MRI.createVirtualRegister(RegClass: VregRC);
6317 Register LaneValLo =
6318 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6319 Register LaneValHi =
6320 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6321 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: AccumulatorVReg)
6322 .addReg(RegNo: Accumulator->getOperand(i: 0).getReg());
6323 unsigned Modifier =
6324 MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
6325 ? SISrcMods::NEG
6326 : SISrcMods::NONE;
6327 auto DstVregInst =
6328 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: DstVreg)
6329 .addImm(Val: Modifier) // src0 modifiers
6330 .addReg(RegNo: LaneValue->getOperand(i: 0).getReg())
6331 .addImm(Val: SISrcMods::NONE) // src1 modifiers
6332 .addReg(RegNo: AccumulatorVReg)
6333 .addImm(Val: SISrcMods::NONE) // clamp
6334 .addImm(Val: SISrcMods::NONE); // omod
6335 auto ReadLaneLo =
6336 BuildMI(BB&: *ComputeLoop, I, MIMD: DL,
6337 MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: LaneValLo);
6338 auto ReadLaneHi =
6339 BuildMI(BB&: *ComputeLoop, I, MIMD: DL,
6340 MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: LaneValHi);
6341 MachineBasicBlock::iterator Iters = *ReadLaneLo;
6342 auto [Op1L, Op1H] = ExtractSubRegs(MI&: *Iters, Op&: DstVregInst->getOperand(i: 0),
6343 SrcRC: VregRC, ST, MRI);
6344 ReadLaneLo.addReg(RegNo: Op1L);
6345 ReadLaneHi.addReg(RegNo: Op1H);
6346 NewAccumulator =
6347 BuildRegSequence(*ComputeLoop, I, DstReg, LaneValLo, LaneValHi);
6348 break;
6349 }
6350 case AMDGPU::S_ADD_U64_PSEUDO:
6351 case AMDGPU::S_SUB_U64_PSEUDO: {
6352 NewAccumulator = BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: DstReg)
6353 .addReg(RegNo: Accumulator->getOperand(i: 0).getReg())
6354 .addReg(RegNo: LaneValue->getOperand(i: 0).getReg());
6355 ComputeLoop =
6356 expand64BitScalarArithmetic(MI&: *NewAccumulator, BB: ComputeLoop);
6357 break;
6358 }
6359 }
6360 }
6361 // Manipulate the iterator to get the next active lane
6362 unsigned BITSETOpc =
6363 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
6364 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: BITSETOpc), DestReg: NewActiveBitsReg)
6365 .addReg(RegNo: FF1Reg)
6366 .addReg(RegNo: ActiveBitsReg);
6367
6368 // Add phi nodes
6369 Accumulator.addReg(RegNo: DstReg).addMBB(MBB: ComputeLoop);
6370 ActiveBits.addReg(RegNo: NewActiveBitsReg).addMBB(MBB: ComputeLoop);
6371
6372 // Creating branching
6373 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
6374 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: CMPOpc))
6375 .addReg(RegNo: NewActiveBitsReg)
6376 .addImm(Val: 0);
6377 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CBRANCH_SCC1))
6378 .addMBB(MBB: ComputeLoop);
6379
6380 RetBB = ComputeEnd;
6381 } else {
6382 assert(ST.hasDPP() && "Sub Target does not support DPP Operations");
6383 MachineBasicBlock *CurrBB = &BB;
6384 Register SrcWithIdentity = MRI.createVirtualRegister(RegClass: SrcRegClass);
6385 Register IdentityVGPR = MRI.createVirtualRegister(RegClass: SrcRegClass);
6386 Register IdentitySGPR = MRI.createVirtualRegister(RegClass: DstRegClass);
6387 Register DPPRowShr1 = MRI.createVirtualRegister(RegClass: SrcRegClass);
6388 Register DPPRowShr2 = MRI.createVirtualRegister(RegClass: SrcRegClass);
6389 Register DPPRowShr4 = MRI.createVirtualRegister(RegClass: SrcRegClass);
6390 Register DPPRowShr8 = MRI.createVirtualRegister(RegClass: SrcRegClass);
6391 Register RowBcast15 = MRI.createVirtualRegister(RegClass: SrcRegClass);
6392 Register ReducedValSGPR = MRI.createVirtualRegister(RegClass: DstRegClass);
6393 Register NegatedReducedVal = MRI.createVirtualRegister(RegClass: DstRegClass);
6394 Register RowBcast31 = MRI.createVirtualRegister(RegClass: SrcRegClass);
6395 Register UndefExec = MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
6396 Register FinalDPPResult;
6397 MachineInstr *SrcWithIdentityInstr;
6398 MachineInstr *LastBcastInstr;
6399 BuildMI(BB&: *CurrBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: UndefExec);
6400
6401 uint64_t IdentityValue = getIdentityValueForWaveReduction(Opc);
6402 BuildMI(BB&: *CurrBB, I&: MI, MIMD: DL,
6403 MCID: TII->get(Opcode: is32BitOpc ? AMDGPU::S_MOV_B32
6404 : AMDGPU::S_MOV_B64_IMM_PSEUDO),
6405 DestReg: IdentitySGPR)
6406 .addImm(Val: IdentityValue);
6407 auto IdentityCopyInstr =
6408 BuildMI(BB&: *CurrBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: IdentityVGPR)
6409 .addReg(RegNo: IdentitySGPR);
6410 auto DPPClampOpcPair = getDPPOpcForWaveReduction(Opc, ST);
6411 unsigned DPPOpc = std::get<0>(t&: DPPClampOpcPair);
6412 unsigned ClampOpc = std::get<1>(t&: DPPClampOpcPair);
6413 auto BuildSetInactiveInstr = [&](Register Dst, Register Src0,
6414 Register Src1) {
6415 return BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_SET_INACTIVE_B32),
6416 DestReg: Dst)
6417 .addImm(Val: 0) // src0 modifiers
6418 .addReg(RegNo: Src0) // src0
6419 .addImm(Val: 0) // src1 modifiers
6420 .addReg(RegNo: Src1) // identity value for inactive lanes
6421 .addReg(RegNo: UndefExec); // bool i1
6422 };
6423 auto BuildDPPMachineInstr = [&](Register Dst, Register Src,
6424 unsigned DPPCtrl) {
6425 auto DPPInstr =
6426 BuildMI(BB&: *CurrBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: DPPOpc), DestReg: Dst).addReg(RegNo: Src); // old
6427 if (isFPOp && !NeedsMovDPP)
6428 DPPInstr.addImm(Val: SISrcMods::NONE); // src0 modifier
6429 DPPInstr.addReg(RegNo: Src); // src0
6430 if (isFPOp && !NeedsMovDPP)
6431 DPPInstr.addImm(Val: SISrcMods::NONE); // src1 modifier
6432 if (!NeedsMovDPP)
6433 DPPInstr.addReg(RegNo: Src); // src1
6434 if (AMDGPU::getNamedOperandIdx(Opcode: DPPOpc, Name: AMDGPU::OpName::clamp) >= 0)
6435 DPPInstr.addImm(Val: 0); // clamp
6436 DPPInstr
6437 .addImm(Val: DPPCtrl) // dpp-ctrl
6438 .addImm(Val: 0xf) // row-mask
6439 .addImm(Val: 0xf) // bank-mask
6440 .addImm(Val: 0); // bound-control
6441 };
6442 auto BuildClampInstr = [&](Register Dst, Register Src0, Register Src1,
6443 bool isAddSub = false,
6444 bool needsCarryIn = false,
6445 Register CarryIn = Register()) {
6446 unsigned InstrOpc = ClampOpc;
6447 Register CarryOutReg = MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
6448 if (needsCarryIn)
6449 InstrOpc = AMDGPU::V_ADDC_U32_e64;
6450 auto ClampInstr = BuildMI(BB&: *CurrBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: InstrOpc), DestReg: Dst);
6451 if (isFPOp)
6452 ClampInstr.addImm(Val: SISrcMods::NONE); // src0 mod
6453 if (isAddSub) {
6454 if (needsCarryIn)
6455 ClampInstr.addReg(RegNo: CarryOutReg,
6456 Flags: RegState::Define |
6457 RegState::Dead); // killed carry-out reg
6458 else
6459 ClampInstr.addReg(RegNo: CarryOutReg, Flags: RegState::Define); // carry-out reg
6460 }
6461 ClampInstr.addReg(RegNo: Src0); // src0
6462 if (isFPOp)
6463 ClampInstr.addImm(Val: SISrcMods::NONE); // src1 mod
6464 ClampInstr.addReg(RegNo: Src1); // src1
6465 if (needsCarryIn)
6466 ClampInstr.addReg(RegNo: CarryIn, Flags: RegState::Kill); // carry-in reg
6467 if (AMDGPU::getNamedOperandIdx(Opcode: InstrOpc, Name: AMDGPU::OpName::clamp) >= 0)
6468 ClampInstr.addImm(Val: 0); // clamp
6469 if (isFPOp)
6470 ClampInstr.addImm(Val: 0); // omod
6471 LastBcastInstr = ClampInstr;
6472 return CarryOutReg;
6473 };
6474 auto BuildPostDPPInstr = [&](Register Src0, Register Src1) {
6475 bool isAddSubOpc =
6476 Opc == AMDGPU::S_ADD_U64_PSEUDO || Opc == AMDGPU::S_SUB_U64_PSEUDO;
6477 bool isBitWiseOpc = Opc == AMDGPU::S_AND_B64 ||
6478 Opc == AMDGPU::S_OR_B64 || Opc == AMDGPU::S_XOR_B64;
6479 Register ReturnReg = MRI.createVirtualRegister(RegClass: SrcRegClass);
6480 if (isAddSubOpc || isBitWiseOpc) {
6481 Register ResLo = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
6482 Register ResHi = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
6483 MachineOperand Src0Operand =
6484 MachineOperand::CreateReg(Reg: Src0, /*isDef=*/false);
6485 MachineOperand Src1Operand =
6486 MachineOperand::CreateReg(Reg: Src1, /*isDef=*/false);
6487 auto [Src0Lo, Src0Hi] =
6488 ExtractSubRegs(MI, Op&: Src0Operand, SrcRC: SrcRegClass, ST, MRI);
6489 auto [Src1Lo, Src1Hi] =
6490 ExtractSubRegs(MI, Op&: Src1Operand, SrcRC: SrcRegClass, ST, MRI);
6491 Register CarryReg = BuildClampInstr(
6492 ResLo, Src0Lo, Src1Lo, isAddSubOpc, /*needsCarryIn*/ false);
6493 BuildClampInstr(ResHi, Src0Hi, Src1Hi, isAddSubOpc,
6494 /*needsCarryIn*/ isAddSubOpc, CarryReg);
6495 BuildRegSequence(*CurrBB, MI, ReturnReg, ResLo, ResHi);
6496 } else {
6497 if (isFPOp) {
6498 BuildMI(BB&: *CurrBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: ReturnReg)
6499 .addImm(Val: SISrcMods::NONE) // src0 modifiers
6500 .addReg(RegNo: Src0)
6501 .addImm(Val: SISrcMods::NONE) // src1 modifiers
6502 .addReg(RegNo: Src1)
6503 .addImm(Val: SISrcMods::NONE) // clamp
6504 .addImm(Val: SISrcMods::NONE); // omod
6505 } else {
6506 Register CmpMaskReg = MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
6507 BuildMI(BB&: *CurrBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: CmpMaskReg)
6508 .addReg(RegNo: Src0) // src0
6509 .addReg(RegNo: Src1); // src1
6510 LastBcastInstr =
6511 BuildMI(BB&: *CurrBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_CNDMASK_B64_PSEUDO),
6512 DestReg: ReturnReg)
6513 .addReg(RegNo: Src1) // src0
6514 .addReg(RegNo: Src0) // src1
6515 .addReg(RegNo: CmpMaskReg); // src2
6516 expand64BitV_CNDMASK(MI&: *LastBcastInstr, BB: CurrBB);
6517 }
6518 }
6519 return ReturnReg;
6520 };
6521
6522 // Set inactive lanes to the identity value.
6523 if (is32BitOpc) {
6524 SrcWithIdentityInstr =
6525 BuildSetInactiveInstr(SrcWithIdentity, SrcReg, IdentityVGPR);
6526 } else {
6527 Register SrcWithIdentitylo =
6528 MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
6529 Register SrcWithIdentityhi =
6530 MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
6531 auto [Reg0Sub0, Reg0Sub1] = ExtractSubRegs(
6532 MI, Op&: IdentityCopyInstr->getOperand(i: 0), SrcRC: SrcRegClass, ST, MRI);
6533 auto [SrcReg0Sub0, SrcReg0Sub1] =
6534 ExtractSubRegs(MI, Op&: MI.getOperand(i: 1), SrcRC: SrcRegClass, ST, MRI);
6535 MachineInstr *SetInactiveLoInstr =
6536 BuildSetInactiveInstr(SrcWithIdentitylo, SrcReg0Sub0, Reg0Sub0);
6537 MachineInstr *SetInactiveHiInstr =
6538 BuildSetInactiveInstr(SrcWithIdentityhi, SrcReg0Sub1, Reg0Sub1);
6539 SrcWithIdentityInstr =
6540 BuildRegSequence(*CurrBB, MI, SrcWithIdentity,
6541 SetInactiveLoInstr->getOperand(i: 0).getReg(),
6542 SetInactiveHiInstr->getOperand(i: 0).getReg());
6543 }
6544 // DPP reduction
6545 Register SrcWithIdentityReg =
6546 SrcWithIdentityInstr->getOperand(i: 0).getReg();
6547 BuildDPPMachineInstr(DPPRowShr1, SrcWithIdentityReg,
6548 AMDGPU::DPP::ROW_SHR_FIRST);
6549 if (NeedsMovDPP)
6550 DPPRowShr1 = BuildPostDPPInstr(SrcWithIdentityReg, DPPRowShr1);
6551
6552 BuildDPPMachineInstr(DPPRowShr2, DPPRowShr1,
6553 (AMDGPU::DPP::ROW_SHR_FIRST + 1));
6554 if (NeedsMovDPP)
6555 DPPRowShr2 = BuildPostDPPInstr(DPPRowShr1, DPPRowShr2);
6556
6557 BuildDPPMachineInstr(DPPRowShr4, DPPRowShr2,
6558 (AMDGPU::DPP::ROW_SHR_FIRST + 3));
6559 if (NeedsMovDPP)
6560 DPPRowShr4 = BuildPostDPPInstr(DPPRowShr2, DPPRowShr4);
6561
6562 BuildDPPMachineInstr(DPPRowShr8, DPPRowShr4,
6563 (AMDGPU::DPP::ROW_SHR_FIRST + 7));
6564 if (NeedsMovDPP)
6565 DPPRowShr8 = BuildPostDPPInstr(DPPRowShr4, DPPRowShr8);
6566
6567 if (ST.hasDPPBroadcasts()) {
6568 BuildDPPMachineInstr(RowBcast15, DPPRowShr8, AMDGPU::DPP::BCAST15);
6569 if (NeedsMovDPP)
6570 RowBcast15 = BuildPostDPPInstr(DPPRowShr8, RowBcast15);
6571 } else {
6572 // magic constant: 0x1E0
6573 // To Set BIT_MODE : bit 15 = 0
6574 // XOR mask : bit [14:10] = 0
6575 // OR mask : bit [9:5] = 15
6576 // AND mask : bit [4:0] = 0
6577 if (is32BitOpc) {
6578 Register SwizzledValue =
6579 MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
6580 BuildMI(BB&: *CurrBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::DS_SWIZZLE_B32),
6581 DestReg: SwizzledValue)
6582 .addReg(RegNo: DPPRowShr8) // addr
6583 .addImm(Val: 0x1E0) // swizzle offset (i16)
6584 .addImm(Val: 0x0); // gds (i1)
6585 BuildClampInstr(RowBcast15, DPPRowShr8, SwizzledValue);
6586 } else {
6587 Register SwizzledValuelo =
6588 MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
6589 Register SwizzledValuehi =
6590 MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
6591 Register SwizzledValue64 = MRI.createVirtualRegister(RegClass: SrcRegClass);
6592 MachineOperand DPPRowShr8Op =
6593 MachineOperand::CreateReg(Reg: DPPRowShr8, /*isDef=*/false);
6594 auto [Op1L, Op1H] =
6595 ExtractSubRegs(MI, Op&: DPPRowShr8Op, SrcRC: SrcRegClass, ST, MRI);
6596 BuildMI(BB&: *CurrBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::DS_SWIZZLE_B32),
6597 DestReg: SwizzledValuelo)
6598 .addReg(RegNo: Op1L) // addr
6599 .addImm(Val: 0x1E0) // swizzle offset (i16)
6600 .addImm(Val: 0x0); // gds (i1)
6601 BuildMI(BB&: *CurrBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::DS_SWIZZLE_B32),
6602 DestReg: SwizzledValuehi)
6603 .addReg(RegNo: Op1H) // addr
6604 .addImm(Val: 0x1E0) // swizzle offset (i16)
6605 .addImm(Val: 0x0); // gds (i1)
6606 BuildRegSequence(*CurrBB, MI, SwizzledValue64, SwizzledValuelo,
6607 SwizzledValuehi);
6608 if (NeedsMovDPP)
6609 RowBcast15 = BuildPostDPPInstr(DPPRowShr8, SwizzledValue64);
6610 else
6611 BuildClampInstr(RowBcast15, DPPRowShr8, SwizzledValue64);
6612 }
6613 }
6614 FinalDPPResult = RowBcast15;
6615 if (!IsWave32) {
6616 if (ST.hasDPPBroadcasts()) {
6617 BuildDPPMachineInstr(RowBcast31, RowBcast15, AMDGPU::DPP::BCAST31);
6618 if (NeedsMovDPP)
6619 RowBcast31 = BuildPostDPPInstr(RowBcast15, RowBcast31);
6620 } else {
6621 Register ShiftedThreadID =
6622 MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
6623 Register PermuteByteOffset =
6624 MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
6625 Register PermutedValue = MRI.createVirtualRegister(RegClass: SrcRegClass);
6626 Register Lane32Offset =
6627 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
6628 Register WordSizeConst =
6629 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
6630 Register ThreadIDRegLo =
6631 MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
6632 Register ThreadIDReg =
6633 MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
6634 // Get the thread ID.
6635 BuildMI(BB&: *CurrBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MBCNT_LO_U32_B32_e64),
6636 DestReg: ThreadIDRegLo)
6637 .addImm(Val: -1)
6638 .addImm(Val: 0);
6639 BuildMI(BB&: *CurrBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MBCNT_HI_U32_B32_e64),
6640 DestReg: ThreadIDReg)
6641 .addImm(Val: -1)
6642 .addReg(RegNo: ThreadIDRegLo);
6643 // shift each lane over by 32 positions, so value in 31st lane is
6644 // present in 63rd lane.
6645 BuildMI(BB&: *CurrBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: Lane32Offset)
6646 .addImm(Val: 0x20);
6647 BuildMI(BB&: *CurrBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_ADD_U32_e64),
6648 DestReg: ShiftedThreadID)
6649 .addReg(RegNo: ThreadIDReg)
6650 .addReg(RegNo: Lane32Offset)
6651 .addImm(Val: 0); // clamp
6652 // multiply by reg size.
6653 BuildMI(BB&: *CurrBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: WordSizeConst)
6654 .addImm(Val: 0x4);
6655 BuildMI(BB&: *CurrBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MUL_LO_U32_e64),
6656 DestReg: PermuteByteOffset)
6657 .addReg(RegNo: WordSizeConst)
6658 .addReg(RegNo: ShiftedThreadID);
6659 // Permute the lanes
6660 if (is32BitOpc) {
6661 BuildMI(BB&: *CurrBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::DS_PERMUTE_B32),
6662 DestReg: PermutedValue)
6663 .addReg(RegNo: PermuteByteOffset) // addr
6664 .addReg(RegNo: RowBcast15) // data
6665 .addImm(Val: 0); // offset
6666 } else {
6667 Register PermutedValuelo =
6668 MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
6669 Register PermutedValuehi =
6670 MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
6671 MachineOperand RowBcast15Op =
6672 MachineOperand::CreateReg(Reg: RowBcast15, /*isDef=*/false);
6673 auto [RowBcast15Lo, RowBcast15Hi] =
6674 ExtractSubRegs(MI, Op&: RowBcast15Op, SrcRC: SrcRegClass, ST, MRI);
6675 BuildMI(BB&: *CurrBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::DS_PERMUTE_B32),
6676 DestReg: PermutedValuelo)
6677 .addReg(RegNo: PermuteByteOffset) // addr
6678 .addReg(RegNo: RowBcast15Lo) // swizzle offset (i16)
6679 .addImm(Val: 0x0); // gds (i1)
6680 BuildMI(BB&: *CurrBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::DS_PERMUTE_B32),
6681 DestReg: PermutedValuehi)
6682 .addReg(RegNo: PermuteByteOffset) // addr
6683 .addReg(RegNo: RowBcast15Hi) // swizzle offset (i16)
6684 .addImm(Val: 0x0); // gds (i1)
6685 BuildRegSequence(*CurrBB, MI, PermutedValue, PermutedValuelo,
6686 PermutedValuehi);
6687 }
6688 if (NeedsMovDPP)
6689 RowBcast31 = BuildPostDPPInstr(RowBcast15, PermutedValue);
6690 else
6691 BuildClampInstr(RowBcast31, RowBcast15, PermutedValue);
6692 }
6693 FinalDPPResult = RowBcast31;
6694 }
6695 if (MIOpc == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32 ||
6696 MIOpc == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64) {
6697 Register NegatedValVGPR = MRI.createVirtualRegister(RegClass: SrcRegClass);
6698 // Opc for f32 reduction is V_SUB_F32.
6699 // For f64, there is no equivalent V_SUB_F64 opcode, so use
6700 // V_ADD_F64/V_ADD_F64_pseudo, and negate the second operand.
6701 BuildMI(BB&: *CurrBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: Opc),
6702 DestReg: NegatedValVGPR)
6703 .addImm(Val: SISrcMods::NONE) // src0 mods
6704 .addReg(RegNo: IdentityVGPR) // src0
6705 .addImm(Val: is32BitOpc ? SISrcMods::NONE : SISrcMods::NEG) // src1 mods
6706 .addReg(RegNo: IsWave32 ? RowBcast15 : RowBcast31) // src1
6707 .addImm(Val: SISrcMods::NONE) // clamp
6708 .addImm(Val: SISrcMods::NONE); // omod
6709 FinalDPPResult = NegatedValVGPR;
6710 }
6711 // The final reduced value is in the last lane.
6712 if (is32BitOpc) {
6713 BuildMI(BB&: *CurrBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READLANE_B32),
6714 DestReg: ReducedValSGPR)
6715 .addReg(RegNo: FinalDPPResult)
6716 .addImm(Val: ST.getWavefrontSize() - 1);
6717 } else {
6718 Register LaneValueLoReg =
6719 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6720 Register LaneValueHiReg =
6721 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6722 const TargetRegisterClass *SrcRC = MRI.getRegClass(Reg: SrcReg);
6723 MachineOperand FinalDPPResultOperand =
6724 MachineOperand::CreateReg(Reg: FinalDPPResult, /*isDef=*/false);
6725 auto [Op1L, Op1H] =
6726 ExtractSubRegs(MI, Op&: FinalDPPResultOperand, SrcRC, ST, MRI);
6727 // lane value input should be in an sgpr
6728 BuildMI(BB&: *CurrBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READLANE_B32),
6729 DestReg: LaneValueLoReg)
6730 .addReg(RegNo: Op1L)
6731 .addImm(Val: ST.getWavefrontSize() - 1);
6732 BuildMI(BB&: *CurrBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READLANE_B32),
6733 DestReg: LaneValueHiReg)
6734 .addReg(RegNo: Op1H)
6735 .addImm(Val: ST.getWavefrontSize() - 1);
6736 BuildRegSequence(*CurrBB, MI, ReducedValSGPR, LaneValueLoReg,
6737 LaneValueHiReg);
6738 }
6739 if (Opc == AMDGPU::S_SUB_I32) {
6740 BuildMI(BB&: *CurrBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_SUB_I32), DestReg: NegatedReducedVal)
6741 .addImm(Val: 0)
6742 .addReg(RegNo: ReducedValSGPR);
6743 } else if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
6744 auto NegatedValInstr =
6745 BuildMI(BB&: *CurrBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: NegatedReducedVal)
6746 .addImm(Val: 0)
6747 .addReg(RegNo: ReducedValSGPR);
6748 CurrBB = expand64BitScalarArithmetic(MI&: *NegatedValInstr, BB: CurrBB);
6749 }
6750 // Mark the final result as a whole-wave-mode calculation.
6751 BuildMI(BB&: *CurrBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::STRICT_WWM), DestReg: DstReg)
6752 .addReg(RegNo: Opc == AMDGPU::S_SUB_I32 || Opc == AMDGPU::S_SUB_U64_PSEUDO
6753 ? NegatedReducedVal
6754 : ReducedValSGPR);
6755 RetBB = CurrBB;
6756 }
6757 }
6758 MI.eraseFromParent();
6759 return RetBB;
6760}
6761
6762MachineBasicBlock *
6763SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
6764 MachineBasicBlock *BB) const {
6765 MachineFunction *MF = BB->getParent();
6766 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
6767 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
6768 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
6769 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
6770 MachineRegisterInfo &MRI = MF->getRegInfo();
6771 const DebugLoc &DL = MI.getDebugLoc();
6772
6773 switch (MI.getOpcode()) {
6774 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
6775 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_MIN_U32);
6776 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
6777 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::V_CMP_LT_U64_e64);
6778 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
6779 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_MIN_I32);
6780 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
6781 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::V_CMP_LT_I64_e64);
6782 case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F32:
6783 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::V_MIN_F32_e64);
6784 case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F64:
6785 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(),
6786 Opc: ST.getGeneration() >= AMDGPUSubtarget::GFX12
6787 ? AMDGPU::V_MIN_NUM_F64_e64
6788 : AMDGPU::V_MIN_F64_e64);
6789 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
6790 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_MAX_U32);
6791 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
6792 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::V_CMP_GT_U64_e64);
6793 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
6794 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_MAX_I32);
6795 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
6796 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::V_CMP_GT_I64_e64);
6797 case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F32:
6798 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::V_MAX_F32_e64);
6799 case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F64:
6800 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(),
6801 Opc: ST.getGeneration() >= AMDGPUSubtarget::GFX12
6802 ? AMDGPU::V_MAX_NUM_F64_e64
6803 : AMDGPU::V_MAX_F64_e64);
6804 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
6805 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_ADD_I32);
6806 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
6807 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_ADD_U64_PSEUDO);
6808 case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F32:
6809 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::V_ADD_F32_e64);
6810 case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F64:
6811 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(),
6812 Opc: ST.getGeneration() >= AMDGPUSubtarget::GFX12
6813 ? AMDGPU::V_ADD_F64_pseudo_e64
6814 : AMDGPU::V_ADD_F64_e64);
6815 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
6816 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_SUB_I32);
6817 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
6818 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_SUB_U64_PSEUDO);
6819 case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32:
6820 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::V_SUB_F32_e64);
6821 case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64:
6822 // There is no S/V_SUB_F64 opcode. Double type subtraction is expanded as
6823 // fadd + neg, by setting the NEG bit in the instruction.
6824 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(),
6825 Opc: ST.getGeneration() >= AMDGPUSubtarget::GFX12
6826 ? AMDGPU::V_ADD_F64_pseudo_e64
6827 : AMDGPU::V_ADD_F64_e64);
6828 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
6829 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_AND_B32);
6830 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
6831 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_AND_B64);
6832 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
6833 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_OR_B32);
6834 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
6835 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_OR_B64);
6836 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
6837 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_XOR_B32);
6838 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
6839 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_XOR_B64);
6840 case AMDGPU::S_UADDO_PSEUDO:
6841 case AMDGPU::S_USUBO_PSEUDO: {
6842 MachineOperand &Dest0 = MI.getOperand(i: 0);
6843 MachineOperand &Dest1 = MI.getOperand(i: 1);
6844 MachineOperand &Src0 = MI.getOperand(i: 2);
6845 MachineOperand &Src1 = MI.getOperand(i: 3);
6846
6847 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
6848 ? AMDGPU::S_ADD_U32
6849 : AMDGPU::S_SUB_U32;
6850 // clang-format off
6851 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: Dest0.getReg())
6852 .add(MO: Src0)
6853 .add(MO: Src1);
6854 // clang-format on
6855
6856 unsigned SelOpc =
6857 Subtarget->isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6858 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: SelOpc), DestReg: Dest1.getReg()).addImm(Val: -1).addImm(Val: 0);
6859
6860 MI.eraseFromParent();
6861 return BB;
6862 }
6863 case AMDGPU::S_ADD_U64_PSEUDO:
6864 case AMDGPU::S_SUB_U64_PSEUDO: {
6865 return expand64BitScalarArithmetic(MI, BB);
6866 }
6867 case AMDGPU::V_ADD_U64_PSEUDO:
6868 case AMDGPU::V_SUB_U64_PSEUDO: {
6869 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
6870
6871 MachineOperand &Dest = MI.getOperand(i: 0);
6872 MachineOperand &Src0 = MI.getOperand(i: 1);
6873 MachineOperand &Src1 = MI.getOperand(i: 2);
6874
6875 if (ST.hasAddSubU64Insts()) {
6876 auto I = BuildMI(BB&: *BB, I&: MI, MIMD: DL,
6877 MCID: TII->get(Opcode: IsAdd ? AMDGPU::V_ADD_U64_e64
6878 : AMDGPU::V_SUB_U64_e64),
6879 DestReg: Dest.getReg())
6880 .add(MO: Src0)
6881 .add(MO: Src1)
6882 .addImm(Val: 0); // clamp
6883 TII->legalizeOperands(MI&: *I);
6884 MI.eraseFromParent();
6885 return BB;
6886 }
6887
6888 if (IsAdd && ST.hasLshlAddU64Inst()) {
6889 auto Add = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_LSHL_ADD_U64_e64),
6890 DestReg: Dest.getReg())
6891 .add(MO: Src0)
6892 .addImm(Val: 0)
6893 .add(MO: Src1);
6894 TII->legalizeOperands(MI&: *Add);
6895 MI.eraseFromParent();
6896 return BB;
6897 }
6898
6899 const auto *CarryRC = TRI->getWaveMaskRegClass();
6900
6901 Register DestSub0 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
6902 Register DestSub1 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
6903
6904 Register CarryReg = MRI.createVirtualRegister(RegClass: CarryRC);
6905 Register DeadCarryReg = MRI.createVirtualRegister(RegClass: CarryRC);
6906
6907 const TargetRegisterClass *Src0RC = Src0.isReg()
6908 ? MRI.getRegClass(Reg: Src0.getReg())
6909 : &AMDGPU::VReg_64RegClass;
6910 const TargetRegisterClass *Src1RC = Src1.isReg()
6911 ? MRI.getRegClass(Reg: Src1.getReg())
6912 : &AMDGPU::VReg_64RegClass;
6913
6914 const TargetRegisterClass *Src0SubRC =
6915 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6916 const TargetRegisterClass *Src1SubRC =
6917 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6918
6919 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
6920 MI, MRI, SuperReg: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub0, SubRC: Src0SubRC);
6921 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
6922 MI, MRI, SuperReg: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub0, SubRC: Src1SubRC);
6923
6924 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
6925 MI, MRI, SuperReg: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub1, SubRC: Src0SubRC);
6926 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
6927 MI, MRI, SuperReg: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub1, SubRC: Src1SubRC);
6928
6929 unsigned LoOpc =
6930 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
6931 MachineInstr *LoHalf = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: LoOpc), DestReg: DestSub0)
6932 .addReg(RegNo: CarryReg, Flags: RegState::Define)
6933 .add(MO: SrcReg0Sub0)
6934 .add(MO: SrcReg1Sub0)
6935 .addImm(Val: 0); // clamp bit
6936
6937 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
6938 MachineInstr *HiHalf =
6939 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: HiOpc), DestReg: DestSub1)
6940 .addReg(RegNo: DeadCarryReg, Flags: RegState::Define | RegState::Dead)
6941 .add(MO: SrcReg0Sub1)
6942 .add(MO: SrcReg1Sub1)
6943 .addReg(RegNo: CarryReg, Flags: RegState::Kill)
6944 .addImm(Val: 0); // clamp bit
6945
6946 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: Dest.getReg())
6947 .addReg(RegNo: DestSub0)
6948 .addImm(Val: AMDGPU::sub0)
6949 .addReg(RegNo: DestSub1)
6950 .addImm(Val: AMDGPU::sub1);
6951 TII->legalizeOperands(MI&: *LoHalf);
6952 TII->legalizeOperands(MI&: *HiHalf);
6953 MI.eraseFromParent();
6954 return BB;
6955 }
6956 case AMDGPU::S_ADD_CO_PSEUDO:
6957 case AMDGPU::S_SUB_CO_PSEUDO: {
6958 // This pseudo has a chance to be selected
6959 // only from uniform add/subcarry node. All the VGPR operands
6960 // therefore assumed to be splat vectors.
6961 MachineBasicBlock::iterator MII = MI;
6962 MachineOperand &Dest = MI.getOperand(i: 0);
6963 MachineOperand &CarryDest = MI.getOperand(i: 1);
6964 MachineOperand &Src0 = MI.getOperand(i: 2);
6965 MachineOperand &Src1 = MI.getOperand(i: 3);
6966 MachineOperand &Src2 = MI.getOperand(i: 4);
6967 if (Src0.isReg() && TRI->isVectorRegister(MRI, Reg: Src0.getReg())) {
6968 Register RegOp0 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6969 BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: RegOp0)
6970 .addReg(RegNo: Src0.getReg());
6971 Src0.setReg(RegOp0);
6972 }
6973 if (Src1.isReg() && TRI->isVectorRegister(MRI, Reg: Src1.getReg())) {
6974 Register RegOp1 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6975 BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: RegOp1)
6976 .addReg(RegNo: Src1.getReg());
6977 Src1.setReg(RegOp1);
6978 }
6979 Register RegOp2 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6980 if (TRI->isVectorRegister(MRI, Reg: Src2.getReg())) {
6981 BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: RegOp2)
6982 .addReg(RegNo: Src2.getReg());
6983 Src2.setReg(RegOp2);
6984 }
6985
6986 if (ST.isWave64()) {
6987 if (ST.hasScalarCompareEq64()) {
6988 BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_LG_U64))
6989 .addReg(RegNo: Src2.getReg())
6990 .addImm(Val: 0);
6991 } else {
6992 const TargetRegisterClass *Src2RC = MRI.getRegClass(Reg: Src2.getReg());
6993 const TargetRegisterClass *SubRC =
6994 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
6995 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
6996 MI: MII, MRI, SuperReg: Src2, SuperRC: Src2RC, SubIdx: AMDGPU::sub0, SubRC);
6997 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
6998 MI: MII, MRI, SuperReg: Src2, SuperRC: Src2RC, SubIdx: AMDGPU::sub1, SubRC);
6999 Register Src2_32 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
7000
7001 BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_OR_B32), DestReg: Src2_32)
7002 .add(MO: Src2Sub0)
7003 .add(MO: Src2Sub1);
7004
7005 BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_LG_U32))
7006 .addReg(RegNo: Src2_32, Flags: RegState::Kill)
7007 .addImm(Val: 0);
7008 }
7009 } else {
7010 BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_LG_U32))
7011 .addReg(RegNo: Src2.getReg())
7012 .addImm(Val: 0);
7013 }
7014
7015 unsigned Opc = MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO
7016 ? AMDGPU::S_ADDC_U32
7017 : AMDGPU::S_SUBB_U32;
7018
7019 BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: Dest.getReg()).add(MO: Src0).add(MO: Src1);
7020
7021 unsigned SelOpc =
7022 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
7023
7024 BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: SelOpc), DestReg: CarryDest.getReg())
7025 .addImm(Val: -1)
7026 .addImm(Val: 0);
7027
7028 MI.eraseFromParent();
7029 return BB;
7030 }
7031 case AMDGPU::SI_INIT_M0: {
7032 MachineOperand &M0Init = MI.getOperand(i: 0);
7033 BuildMI(BB&: *BB, I: MI.getIterator(), MIMD: MI.getDebugLoc(),
7034 MCID: TII->get(Opcode: M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
7035 DestReg: AMDGPU::M0)
7036 .add(MO: M0Init);
7037 MI.eraseFromParent();
7038 return BB;
7039 }
7040 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
7041 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
7042 BuildMI(BB&: *BB, I: MI.getIterator(), MIMD: MI.getDebugLoc(),
7043 MCID: TII->get(Opcode: AMDGPU::S_CMP_EQ_U32))
7044 .addImm(Val: 0)
7045 .addImm(Val: 0);
7046 return BB;
7047 }
7048 case AMDGPU::GET_GROUPSTATICSIZE: {
7049 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
7050 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
7051 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32))
7052 .add(MO: MI.getOperand(i: 0))
7053 .addImm(Val: MFI->getLDSSize());
7054 MI.eraseFromParent();
7055 return BB;
7056 }
7057 case AMDGPU::GET_SHADERCYCLESHILO: {
7058 assert(MF->getSubtarget<GCNSubtarget>().hasShaderCyclesHiLoRegisters());
7059 // The algorithm is:
7060 //
7061 // hi1 = getreg(SHADER_CYCLES_HI)
7062 // lo1 = getreg(SHADER_CYCLES_LO)
7063 // hi2 = getreg(SHADER_CYCLES_HI)
7064 //
7065 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
7066 // Otherwise there was overflow and the result is hi2:0. In both cases the
7067 // result should represent the actual time at some point during the sequence
7068 // of three getregs.
7069 using namespace AMDGPU::Hwreg;
7070 Register RegHi1 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
7071 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_GETREG_B32), DestReg: RegHi1)
7072 .addImm(Val: HwregEncoding::encode(Values: ID_SHADER_CYCLES_HI, Values: 0, Values: 32));
7073 Register RegLo1 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
7074 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_GETREG_B32), DestReg: RegLo1)
7075 .addImm(Val: HwregEncoding::encode(Values: ID_SHADER_CYCLES, Values: 0, Values: 32));
7076 Register RegHi2 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
7077 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_GETREG_B32), DestReg: RegHi2)
7078 .addImm(Val: HwregEncoding::encode(Values: ID_SHADER_CYCLES_HI, Values: 0, Values: 32));
7079 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_EQ_U32))
7080 .addReg(RegNo: RegHi1)
7081 .addReg(RegNo: RegHi2);
7082 Register RegLo = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
7083 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CSELECT_B32), DestReg: RegLo)
7084 .addReg(RegNo: RegLo1)
7085 .addImm(Val: 0);
7086 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::REG_SEQUENCE))
7087 .add(MO: MI.getOperand(i: 0))
7088 .addReg(RegNo: RegLo)
7089 .addImm(Val: AMDGPU::sub0)
7090 .addReg(RegNo: RegHi2)
7091 .addImm(Val: AMDGPU::sub1);
7092 MI.eraseFromParent();
7093 return BB;
7094 }
7095 case AMDGPU::SI_INDIRECT_SRC_V1:
7096 case AMDGPU::SI_INDIRECT_SRC_V2:
7097 case AMDGPU::SI_INDIRECT_SRC_V3:
7098 case AMDGPU::SI_INDIRECT_SRC_V4:
7099 case AMDGPU::SI_INDIRECT_SRC_V5:
7100 case AMDGPU::SI_INDIRECT_SRC_V6:
7101 case AMDGPU::SI_INDIRECT_SRC_V7:
7102 case AMDGPU::SI_INDIRECT_SRC_V8:
7103 case AMDGPU::SI_INDIRECT_SRC_V9:
7104 case AMDGPU::SI_INDIRECT_SRC_V10:
7105 case AMDGPU::SI_INDIRECT_SRC_V11:
7106 case AMDGPU::SI_INDIRECT_SRC_V12:
7107 case AMDGPU::SI_INDIRECT_SRC_V16:
7108 case AMDGPU::SI_INDIRECT_SRC_V32:
7109 return emitIndirectSrc(MI, MBB&: *BB, ST: *getSubtarget());
7110 case AMDGPU::SI_INDIRECT_DST_V1:
7111 case AMDGPU::SI_INDIRECT_DST_V2:
7112 case AMDGPU::SI_INDIRECT_DST_V3:
7113 case AMDGPU::SI_INDIRECT_DST_V4:
7114 case AMDGPU::SI_INDIRECT_DST_V5:
7115 case AMDGPU::SI_INDIRECT_DST_V6:
7116 case AMDGPU::SI_INDIRECT_DST_V7:
7117 case AMDGPU::SI_INDIRECT_DST_V8:
7118 case AMDGPU::SI_INDIRECT_DST_V9:
7119 case AMDGPU::SI_INDIRECT_DST_V10:
7120 case AMDGPU::SI_INDIRECT_DST_V11:
7121 case AMDGPU::SI_INDIRECT_DST_V12:
7122 case AMDGPU::SI_INDIRECT_DST_V16:
7123 case AMDGPU::SI_INDIRECT_DST_V32:
7124 return emitIndirectDst(MI, MBB&: *BB, ST: *getSubtarget());
7125 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
7126 case AMDGPU::SI_KILL_I1_PSEUDO:
7127 return splitKillBlock(MI, BB);
7128 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
7129 expand64BitV_CNDMASK(MI, BB);
7130 return BB;
7131 }
7132 case AMDGPU::SI_BR_UNDEF: {
7133 MachineInstr *Br = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CBRANCH_SCC1))
7134 .add(MO: MI.getOperand(i: 0));
7135 Br->getOperand(i: 1).setIsUndef(); // read undef SCC
7136 MI.eraseFromParent();
7137 return BB;
7138 }
7139 case AMDGPU::ADJCALLSTACKUP:
7140 case AMDGPU::ADJCALLSTACKDOWN: {
7141 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
7142 MachineInstrBuilder MIB(*MF, &MI);
7143 MIB.addReg(RegNo: Info->getStackPtrOffsetReg(), Flags: RegState::ImplicitDefine)
7144 .addReg(RegNo: Info->getStackPtrOffsetReg(), Flags: RegState::Implicit);
7145 return BB;
7146 }
7147 case AMDGPU::SI_CALL_ISEL: {
7148 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(MF: *MF);
7149
7150 MachineInstrBuilder MIB;
7151 MIB = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::SI_CALL), DestReg: ReturnAddrReg);
7152
7153 for (const MachineOperand &MO : MI.operands())
7154 MIB.add(MO);
7155
7156 MIB.cloneMemRefs(OtherMI: MI);
7157 MI.eraseFromParent();
7158 return BB;
7159 }
7160 case AMDGPU::V_ADD_CO_U32_e32:
7161 case AMDGPU::V_SUB_CO_U32_e32:
7162 case AMDGPU::V_SUBREV_CO_U32_e32: {
7163 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
7164 unsigned Opc = MI.getOpcode();
7165
7166 bool NeedClampOperand = false;
7167 if (TII->pseudoToMCOpcode(Opcode: Opc) == -1) {
7168 Opc = AMDGPU::getVOPe64(Opcode: Opc);
7169 NeedClampOperand = true;
7170 }
7171
7172 auto I = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: MI.getOperand(i: 0).getReg());
7173 if (TII->isVOP3(MI: *I)) {
7174 I.addReg(RegNo: TRI->getVCC(), Flags: RegState::Define);
7175 }
7176 I.add(MO: MI.getOperand(i: 1)).add(MO: MI.getOperand(i: 2));
7177 if (NeedClampOperand)
7178 I.addImm(Val: 0); // clamp bit for e64 encoding
7179
7180 TII->legalizeOperands(MI&: *I);
7181
7182 MI.eraseFromParent();
7183 return BB;
7184 }
7185 case AMDGPU::V_ADDC_U32_e32:
7186 case AMDGPU::V_SUBB_U32_e32:
7187 case AMDGPU::V_SUBBREV_U32_e32:
7188 // These instructions have an implicit use of vcc which counts towards the
7189 // constant bus limit.
7190 TII->legalizeOperands(MI);
7191 return BB;
7192 case AMDGPU::DS_GWS_INIT:
7193 case AMDGPU::DS_GWS_SEMA_BR:
7194 case AMDGPU::DS_GWS_BARRIER:
7195 case AMDGPU::DS_GWS_SEMA_V:
7196 case AMDGPU::DS_GWS_SEMA_P:
7197 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
7198 // A s_waitcnt 0 is required to be the instruction immediately following.
7199 if (getSubtarget()->hasGWSAutoReplay()) {
7200 bundleInstWithWaitcnt(MI);
7201 return BB;
7202 }
7203
7204 return emitGWSMemViolTestLoop(MI, BB);
7205 case AMDGPU::S_SETREG_B32: {
7206 // Try to optimize cases that only set the denormal mode or rounding mode.
7207 //
7208 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
7209 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
7210 // instead.
7211 //
7212 // FIXME: This could be predicates on the immediate, but tablegen doesn't
7213 // allow you to have a no side effect instruction in the output of a
7214 // sideeffecting pattern.
7215 auto [ID, Offset, Width] =
7216 AMDGPU::Hwreg::HwregEncoding::decode(Encoded: MI.getOperand(i: 1).getImm());
7217 if (ID != AMDGPU::Hwreg::ID_MODE)
7218 return BB;
7219
7220 const unsigned WidthMask = maskTrailingOnes<unsigned>(N: Width);
7221 const unsigned SetMask = WidthMask << Offset;
7222
7223 if (getSubtarget()->hasDenormModeInst()) {
7224 unsigned SetDenormOp = 0;
7225 unsigned SetRoundOp = 0;
7226
7227 // The dedicated instructions can only set the whole denorm or round mode
7228 // at once, not a subset of bits in either.
7229 if (SetMask ==
7230 (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
7231 // If this fully sets both the round and denorm mode, emit the two
7232 // dedicated instructions for these.
7233 SetRoundOp = AMDGPU::S_ROUND_MODE;
7234 SetDenormOp = AMDGPU::S_DENORM_MODE;
7235 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
7236 SetRoundOp = AMDGPU::S_ROUND_MODE;
7237 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
7238 SetDenormOp = AMDGPU::S_DENORM_MODE;
7239 }
7240
7241 if (SetRoundOp || SetDenormOp) {
7242 MachineInstr *Def = MRI.getVRegDef(Reg: MI.getOperand(i: 0).getReg());
7243 if (Def && Def->isMoveImmediate() && Def->getOperand(i: 1).isImm()) {
7244 unsigned ImmVal = Def->getOperand(i: 1).getImm();
7245 if (SetRoundOp) {
7246 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: SetRoundOp))
7247 .addImm(Val: ImmVal & 0xf);
7248
7249 // If we also have the denorm mode, get just the denorm mode bits.
7250 ImmVal >>= 4;
7251 }
7252
7253 if (SetDenormOp) {
7254 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: SetDenormOp))
7255 .addImm(Val: ImmVal & 0xf);
7256 }
7257
7258 MI.eraseFromParent();
7259 return BB;
7260 }
7261 }
7262 }
7263
7264 // If only FP bits are touched, used the no side effects pseudo.
7265 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
7266 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
7267 MI.setDesc(TII->get(Opcode: AMDGPU::S_SETREG_B32_mode));
7268
7269 return BB;
7270 }
7271 case AMDGPU::S_INVERSE_BALLOT_U32:
7272 case AMDGPU::S_INVERSE_BALLOT_U64:
7273 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
7274 // necessary. After that they are equivalent to a COPY.
7275 MI.setDesc(TII->get(Opcode: AMDGPU::COPY));
7276 return BB;
7277 case AMDGPU::ENDPGM_TRAP: {
7278 if (BB->succ_empty() && std::next(x: MI.getIterator()) == BB->end()) {
7279 MI.setDesc(TII->get(Opcode: AMDGPU::S_ENDPGM));
7280 MI.addOperand(Op: MachineOperand::CreateImm(Val: 0));
7281 return BB;
7282 }
7283
7284 // We need a block split to make the real endpgm a terminator. We also don't
7285 // want to break phis in successor blocks, so we can't just delete to the
7286 // end of the block.
7287
7288 MachineBasicBlock *SplitBB = BB->splitAt(SplitInst&: MI, UpdateLiveIns: false /*UpdateLiveIns*/);
7289 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
7290 MF->push_back(MBB: TrapBB);
7291 // clang-format off
7292 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ENDPGM))
7293 .addImm(Val: 0);
7294 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CBRANCH_EXECNZ))
7295 .addMBB(MBB: TrapBB);
7296 // clang-format on
7297
7298 BB->addSuccessor(Succ: TrapBB);
7299 MI.eraseFromParent();
7300 return SplitBB;
7301 }
7302 case AMDGPU::SIMULATED_TRAP: {
7303 assert(Subtarget->hasPrivEnabledTrap2NopBug());
7304 MachineBasicBlock *SplitBB =
7305 TII->insertSimulatedTrap(MRI, MBB&: *BB, MI, DL: MI.getDebugLoc());
7306 MI.eraseFromParent();
7307 return SplitBB;
7308 }
7309 case AMDGPU::SI_TCRETURN_GFX_WholeWave:
7310 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
7311 assert(MFI->isWholeWaveFunction());
7312
7313 // During ISel, it's difficult to propagate the original EXEC mask to use as
7314 // an input to SI_WHOLE_WAVE_FUNC_RETURN. Set it up here instead.
7315 MachineInstr *Setup = TII->getWholeWaveFunctionSetup(MF&: *BB->getParent());
7316 assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
7317 Register OriginalExec = Setup->getOperand(i: 0).getReg();
7318 MF->getRegInfo().clearKillFlags(Reg: OriginalExec);
7319 MI.getOperand(i: 0).setReg(OriginalExec);
7320 return BB;
7321 }
7322 default:
7323 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
7324 if (!MI.mayStore())
7325 AddMemOpInit(MI);
7326 return BB;
7327 }
7328 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, MBB: BB);
7329 }
7330}
7331
7332bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
7333 // This currently forces unfolding various combinations of fsub into fma with
7334 // free fneg'd operands. As long as we have fast FMA (controlled by
7335 // isFMAFasterThanFMulAndFAdd), we should perform these.
7336
7337 // When fma is quarter rate, for f64 where add / sub are at best half rate,
7338 // most of these combines appear to be cycle neutral but save on instruction
7339 // count / code size.
7340 return true;
7341}
7342
7343bool SITargetLowering::enableAggressiveFMAFusion(LLT Ty) const { return true; }
7344
7345EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
7346 EVT VT) const {
7347 if (!VT.isVector()) {
7348 return MVT::i1;
7349 }
7350 return EVT::getVectorVT(Context&: Ctx, VT: MVT::i1, NumElements: VT.getVectorNumElements());
7351}
7352
7353MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
7354 // TODO: Should i16 be used always if legal? For now it would force VALU
7355 // shifts.
7356 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
7357}
7358
7359LLT SITargetLowering::getPreferredShiftAmountTy(LLT Ty) const {
7360 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
7361 ? Ty.changeElementSize(NewEltSize: 16)
7362 : Ty.changeElementSize(NewEltSize: 32);
7363}
7364
7365// Answering this is somewhat tricky and depends on the specific device which
7366// have different rates for fma or all f64 operations.
7367//
7368// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
7369// regardless of which device (although the number of cycles differs between
7370// devices), so it is always profitable for f64.
7371//
7372// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
7373// only on full rate devices. Normally, we should prefer selecting v_mad_f32
7374// which we can always do even without fused FP ops since it returns the same
7375// result as the separate operations and since it is always full
7376// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
7377// however does not support denormals, so we do report fma as faster if we have
7378// a fast fma device and require denormals.
7379//
7380bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
7381 EVT VT) const {
7382 VT = VT.getScalarType();
7383
7384 switch (VT.getSimpleVT().SimpleTy) {
7385 case MVT::f32: {
7386 // If mad is not available this depends only on if f32 fma is full rate.
7387 if (!Subtarget->hasMadMacF32Insts())
7388 return Subtarget->hasFastFMAF32();
7389
7390 // Otherwise f32 mad is always full rate and returns the same result as
7391 // the separate operations so should be preferred over fma.
7392 // However does not support denormals.
7393 if (!denormalModeIsFlushAllF32(MF))
7394 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
7395
7396 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
7397 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
7398 }
7399 case MVT::f64:
7400 return true;
7401 case MVT::f16:
7402 case MVT::bf16:
7403 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
7404 default:
7405 break;
7406 }
7407
7408 return false;
7409}
7410
7411bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
7412 LLT Ty) const {
7413 switch (Ty.getScalarSizeInBits()) {
7414 case 16:
7415 return isFMAFasterThanFMulAndFAdd(MF, VT: MVT::f16);
7416 case 32:
7417 return isFMAFasterThanFMulAndFAdd(MF, VT: MVT::f32);
7418 case 64:
7419 return isFMAFasterThanFMulAndFAdd(MF, VT: MVT::f64);
7420 default:
7421 break;
7422 }
7423
7424 return false;
7425}
7426
7427bool SITargetLowering::isFMADLegal(const MachineInstr &MI, LLT Ty) const {
7428 if (!Ty.isScalar())
7429 return false;
7430
7431 if (Ty.getScalarSizeInBits() == 16)
7432 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(MF: *MI.getMF());
7433 if (Ty.getScalarSizeInBits() == 32)
7434 return Subtarget->hasMadMacF32Insts() &&
7435 denormalModeIsFlushAllF32(MF: *MI.getMF());
7436
7437 return false;
7438}
7439
7440bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG,
7441 const SDNode *N) const {
7442 // TODO: Check future ftz flag
7443 // v_mad_f32/v_mac_f32 do not support denormals.
7444 EVT VT = N->getValueType(ResNo: 0);
7445 if (VT == MVT::f32)
7446 return Subtarget->hasMadMacF32Insts() &&
7447 denormalModeIsFlushAllF32(MF: DAG.getMachineFunction());
7448 if (VT == MVT::f16) {
7449 return Subtarget->hasMadF16() &&
7450 denormalModeIsFlushAllF64F16(MF: DAG.getMachineFunction());
7451 }
7452
7453 return false;
7454}
7455
7456//===----------------------------------------------------------------------===//
7457// Custom DAG Lowering Operations
7458//===----------------------------------------------------------------------===//
7459
7460// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
7461// wider vector type is legal.
7462SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
7463 SelectionDAG &DAG) const {
7464 unsigned Opc = Op.getOpcode();
7465 EVT VT = Op.getValueType();
7466 assert(VT.isVector() && VT.getVectorElementCount().isKnownEven());
7467
7468 auto [Lo, Hi] = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 0);
7469 auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VT);
7470
7471 SDLoc SL(Op);
7472
7473 // Forward any trailing scalar operands unchanged to both halves.
7474 SmallVector<SDValue, 2> LoOps = {Lo};
7475 SmallVector<SDValue, 2> HiOps = {Hi};
7476 auto TrailingOps = drop_begin(RangeOrContainer: Op->ops());
7477 LoOps.append(in_start: TrailingOps.begin(), in_end: TrailingOps.end());
7478 HiOps.append(in_start: TrailingOps.begin(), in_end: TrailingOps.end());
7479
7480 SDValue OpLo = DAG.getNode(Opcode: Opc, DL: SL, VT: LoVT, Ops: LoOps, Flags: Op->getFlags());
7481 SDValue OpHi = DAG.getNode(Opcode: Opc, DL: SL, VT: HiVT, Ops: HiOps, Flags: Op->getFlags());
7482
7483 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(Op), VT, N1: OpLo, N2: OpHi);
7484}
7485
7486// Enable lowering of ROTR for vxi32 types. This is a workaround for a
7487// regression whereby extra unnecessary instructions were added to codegen
7488// for rotr operations, casued by legalising v2i32 or. This resulted in extra
7489// instructions to extract the result from the vector.
7490SDValue SITargetLowering::lowerROTR(SDValue Op, SelectionDAG &DAG) const {
7491 [[maybe_unused]] EVT VT = Op.getValueType();
7492
7493 assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
7494 VT == MVT::v16i32) &&
7495 "Unexpected ValueType.");
7496
7497 return DAG.UnrollVectorOp(N: Op.getNode());
7498}
7499
7500// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
7501// wider vector type is legal.
7502SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
7503 SelectionDAG &DAG) const {
7504 unsigned Opc = Op.getOpcode();
7505 EVT VT = Op.getValueType();
7506 assert(VT.isVector() && VT.getVectorElementCount().isKnownEven());
7507
7508 auto [Lo0, Hi0] = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 0);
7509 auto [Lo1, Hi1] = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 1);
7510
7511 SDLoc SL(Op);
7512
7513 SDValue OpLo =
7514 DAG.getNode(Opcode: Opc, DL: SL, VT: Lo0.getValueType(), N1: Lo0, N2: Lo1, Flags: Op->getFlags());
7515 SDValue OpHi =
7516 DAG.getNode(Opcode: Opc, DL: SL, VT: Hi0.getValueType(), N1: Hi0, N2: Hi1, Flags: Op->getFlags());
7517
7518 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(Op), VT, N1: OpLo, N2: OpHi);
7519}
7520
7521SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
7522 SelectionDAG &DAG) const {
7523 unsigned Opc = Op.getOpcode();
7524 EVT VT = Op.getValueType();
7525 assert(VT.isVector() && VT.getVectorElementCount().isKnownEven());
7526
7527 SDValue Op0 = Op.getOperand(i: 0);
7528 auto [Lo0, Hi0] = Op0.getValueType().isVector()
7529 ? DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 0)
7530 : std::pair(Op0, Op0);
7531
7532 auto [Lo1, Hi1] = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 1);
7533 auto [Lo2, Hi2] = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 2);
7534
7535 SDLoc SL(Op);
7536 auto ResVT = DAG.GetSplitDestVTs(VT);
7537
7538 SDValue OpLo =
7539 DAG.getNode(Opcode: Opc, DL: SL, VT: ResVT.first, N1: Lo0, N2: Lo1, N3: Lo2, Flags: Op->getFlags());
7540 SDValue OpHi =
7541 DAG.getNode(Opcode: Opc, DL: SL, VT: ResVT.second, N1: Hi0, N2: Hi1, N3: Hi2, Flags: Op->getFlags());
7542
7543 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(Op), VT, N1: OpLo, N2: OpHi);
7544}
7545
7546SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
7547 switch (Op.getOpcode()) {
7548 default:
7549 return AMDGPUTargetLowering::LowerOperation(Op, DAG);
7550 case ISD::BRCOND:
7551 return LowerBRCOND(Op, DAG);
7552 case ISD::RETURNADDR:
7553 return LowerRETURNADDR(Op, DAG);
7554 case ISD::SPONENTRY:
7555 return LowerSPONENTRY(Op, DAG);
7556 case ISD::LOAD: {
7557 SDValue Result = LowerLOAD(Op, DAG);
7558 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
7559 "Load should return a value and a chain");
7560 return Result;
7561 }
7562 case ISD::FSQRT: {
7563 EVT VT = Op.getValueType();
7564 if (VT == MVT::f32)
7565 return lowerFSQRTF32(Op, DAG);
7566 if (VT == MVT::f64)
7567 return lowerFSQRTF64(Op, DAG);
7568 return SDValue();
7569 }
7570 case ISD::FSIN:
7571 case ISD::FCOS:
7572 return LowerTrig(Op, DAG);
7573 case ISD::SELECT:
7574 return LowerSELECT(Op, DAG);
7575 case ISD::FDIV:
7576 return LowerFDIV(Op, DAG);
7577 case ISD::FFREXP:
7578 return LowerFFREXP(Op, DAG);
7579 case ISD::ATOMIC_CMP_SWAP:
7580 return LowerATOMIC_CMP_SWAP(Op, DAG);
7581 case ISD::STORE:
7582 return LowerSTORE(Op, DAG);
7583 case ISD::GlobalAddress: {
7584 MachineFunction &MF = DAG.getMachineFunction();
7585 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
7586 return LowerGlobalAddress(MFI, Op, DAG);
7587 }
7588 case ISD::BlockAddress:
7589 return LowerBlockAddress(Op, DAG);
7590 case ISD::ExternalSymbol:
7591 return LowerExternalSymbol(Op, DAG);
7592 case ISD::INTRINSIC_WO_CHAIN:
7593 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
7594 case ISD::INTRINSIC_W_CHAIN:
7595 return LowerINTRINSIC_W_CHAIN(Op, DAG);
7596 case ISD::INTRINSIC_VOID:
7597 return LowerINTRINSIC_VOID(Op, DAG);
7598 case ISD::ADDRSPACECAST:
7599 return lowerADDRSPACECAST(Op, DAG);
7600 case ISD::INSERT_SUBVECTOR:
7601 return lowerINSERT_SUBVECTOR(Op, DAG);
7602 case ISD::INSERT_VECTOR_ELT:
7603 return lowerINSERT_VECTOR_ELT(Op, DAG);
7604 case ISD::EXTRACT_VECTOR_ELT:
7605 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
7606 case ISD::VECTOR_SHUFFLE:
7607 return lowerVECTOR_SHUFFLE(Op, DAG);
7608 case ISD::SCALAR_TO_VECTOR:
7609 return lowerSCALAR_TO_VECTOR(Op, DAG);
7610 case ISD::BUILD_VECTOR:
7611 return lowerBUILD_VECTOR(Op, DAG);
7612 case ISD::FP_ROUND:
7613 case ISD::STRICT_FP_ROUND:
7614 return lowerFP_ROUND(Op, DAG);
7615 case ISD::TRAP:
7616 return lowerTRAP(Op, DAG);
7617 case ISD::DEBUGTRAP:
7618 return lowerDEBUGTRAP(Op, DAG);
7619 case ISD::ABS:
7620 case ISD::FABS:
7621 case ISD::FNEG:
7622 case ISD::FCANONICALIZE:
7623 case ISD::BSWAP:
7624 return splitUnaryVectorOp(Op, DAG);
7625 case ISD::FP_TO_SINT_SAT:
7626 case ISD::FP_TO_UINT_SAT:
7627 if (Op.getValueType().isVector() && Op.getValueType() != MVT::v2i16 &&
7628 Op.getOperand(i: 0).getValueType().getScalarType() == MVT::f32)
7629 return splitUnaryVectorOp(Op, DAG);
7630 return LowerFP_TO_INT_SAT(Op, DAG);
7631 case ISD::FMINNUM:
7632 case ISD::FMAXNUM:
7633 return lowerFMINNUM_FMAXNUM(Op, DAG);
7634 case ISD::FMINIMUMNUM:
7635 case ISD::FMAXIMUMNUM:
7636 return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
7637 case ISD::FMINIMUM:
7638 case ISD::FMAXIMUM:
7639 return lowerFMINIMUM_FMAXIMUM(Op, DAG);
7640 case ISD::FLDEXP:
7641 case ISD::STRICT_FLDEXP:
7642 return lowerFLDEXP(Op, DAG);
7643 case ISD::FMA:
7644 return splitTernaryVectorOp(Op, DAG);
7645 case ISD::FP_TO_SINT:
7646 case ISD::FP_TO_UINT:
7647 if (Subtarget->hasVCvtPkIU16F32() && Op.getValueType() == MVT::i16 &&
7648 Op.getOperand(i: 0).getValueType() == MVT::f32) {
7649 // Make f32->i16 legal so we can select V_CVT_PK_[IU]16_F32.
7650 return Op;
7651 }
7652 return LowerFP_TO_INT(Op, DAG);
7653 case ISD::SHL:
7654 case ISD::SRA:
7655 case ISD::SRL:
7656 case ISD::ADD:
7657 case ISD::SUB:
7658 case ISD::SMIN:
7659 case ISD::SMAX:
7660 case ISD::UMIN:
7661 case ISD::UMAX:
7662 case ISD::FADD:
7663 case ISD::FMUL:
7664 case ISD::FMINNUM_IEEE:
7665 case ISD::FMAXNUM_IEEE:
7666 case ISD::UADDSAT:
7667 case ISD::USUBSAT:
7668 case ISD::SADDSAT:
7669 case ISD::SSUBSAT:
7670 return splitBinaryVectorOp(Op, DAG);
7671 case ISD::FCOPYSIGN:
7672 return lowerFCOPYSIGN(Op, DAG);
7673 case ISD::MUL:
7674 return lowerMUL(Op, DAG);
7675 case ISD::SMULO:
7676 case ISD::UMULO:
7677 return lowerXMULO(Op, DAG);
7678 case ISD::SMUL_LOHI:
7679 case ISD::UMUL_LOHI:
7680 return lowerXMUL_LOHI(Op, DAG);
7681 case ISD::DYNAMIC_STACKALLOC:
7682 return LowerDYNAMIC_STACKALLOC(Op, DAG);
7683 case ISD::STACKSAVE:
7684 return LowerSTACKSAVE(Op, DAG);
7685 case ISD::GET_ROUNDING:
7686 return lowerGET_ROUNDING(Op, DAG);
7687 case ISD::SET_ROUNDING:
7688 return lowerSET_ROUNDING(Op, DAG);
7689 case ISD::PREFETCH:
7690 return lowerPREFETCH(Op, DAG);
7691 case ISD::FP_EXTEND:
7692 case ISD::STRICT_FP_EXTEND:
7693 return lowerFP_EXTEND(Op, DAG);
7694 case ISD::GET_FPENV:
7695 return lowerGET_FPENV(Op, DAG);
7696 case ISD::SET_FPENV:
7697 return lowerSET_FPENV(Op, DAG);
7698 case ISD::ROTR:
7699 return lowerROTR(Op, DAG);
7700 case ISD::INLINEASM:
7701 return LowerINLINEASM(Op, DAG);
7702 }
7703 return SDValue();
7704}
7705
7706// Used for D16: Casts the result of an instruction into the right vector,
7707// packs values if loads return unpacked values.
7708static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
7709 const SDLoc &DL, SelectionDAG &DAG,
7710 bool Unpacked) {
7711 if (!LoadVT.isVector())
7712 return Result;
7713
7714 // Cast back to the original packed type or to a larger type that is a
7715 // multiple of 32 bit for D16. Widening the return type is a required for
7716 // legalization.
7717 EVT FittingLoadVT = LoadVT;
7718 if ((LoadVT.getVectorNumElements() % 2) == 1) {
7719 FittingLoadVT =
7720 EVT::getVectorVT(Context&: *DAG.getContext(), VT: LoadVT.getVectorElementType(),
7721 NumElements: LoadVT.getVectorNumElements() + 1);
7722 }
7723
7724 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
7725 // Truncate to v2i16/v4i16.
7726 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
7727
7728 // Workaround legalizer not scalarizing truncate after vector op
7729 // legalization but not creating intermediate vector trunc.
7730 SmallVector<SDValue, 4> Elts;
7731 DAG.ExtractVectorElements(Op: Result, Args&: Elts);
7732 for (SDValue &Elt : Elts)
7733 Elt = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: Elt);
7734
7735 // Pad illegal v1i16/v3fi6 to v4i16
7736 if ((LoadVT.getVectorNumElements() % 2) == 1)
7737 Elts.push_back(Elt: DAG.getPOISON(VT: MVT::i16));
7738
7739 Result = DAG.getBuildVector(VT: IntLoadVT, DL, Ops: Elts);
7740
7741 // Bitcast to original type (v2f16/v4f16).
7742 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: FittingLoadVT, Operand: Result);
7743 }
7744
7745 // Cast back to the original packed type.
7746 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: FittingLoadVT, Operand: Result);
7747}
7748
7749SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
7750 SelectionDAG &DAG,
7751 ArrayRef<SDValue> Ops,
7752 bool IsIntrinsic) const {
7753 SDLoc DL(M);
7754
7755 bool Unpacked = Subtarget->hasUnpackedD16VMem();
7756 EVT LoadVT = M->getValueType(ResNo: 0);
7757
7758 EVT EquivLoadVT = LoadVT;
7759 if (LoadVT.isVector()) {
7760 if (Unpacked) {
7761 EquivLoadVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32,
7762 NumElements: LoadVT.getVectorNumElements());
7763 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
7764 // Widen v3f16 to legal type
7765 EquivLoadVT =
7766 EVT::getVectorVT(Context&: *DAG.getContext(), VT: LoadVT.getVectorElementType(),
7767 NumElements: LoadVT.getVectorNumElements() + 1);
7768 }
7769 }
7770
7771 // Change from v4f16/v2f16 to EquivLoadVT.
7772 SDVTList VTList = DAG.getVTList(VT1: EquivLoadVT, VT2: MVT::Other);
7773
7774 SDValue Load = DAG.getMemIntrinsicNode(
7775 Opcode: IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, dl: DL, VTList, Ops,
7776 MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
7777
7778 SDValue Adjusted = adjustLoadValueTypeImpl(Result: Load, LoadVT, DL, DAG, Unpacked);
7779
7780 return DAG.getMergeValues(Ops: {Adjusted, Load.getValue(R: 1)}, dl: DL);
7781}
7782
7783SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
7784 SelectionDAG &DAG,
7785 ArrayRef<SDValue> Ops) const {
7786 SDLoc DL(M);
7787 EVT LoadVT = M->getValueType(ResNo: 0);
7788 EVT EltType = LoadVT.getScalarType();
7789 EVT IntVT = LoadVT.changeTypeToInteger();
7790
7791 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
7792
7793 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
7794 bool IsTFE = M->getNumValues() == 3;
7795
7796 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
7797 : AMDGPUISD::BUFFER_LOAD_FORMAT)
7798 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
7799 : AMDGPUISD::BUFFER_LOAD;
7800
7801 if (IsD16) {
7802 return adjustLoadValueType(Opcode: AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
7803 }
7804
7805 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
7806 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
7807 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, MMO: M->getMemOperand(),
7808 IsTFE);
7809
7810 if (isTypeLegal(VT: LoadVT)) {
7811 return getMemIntrinsicNode(Opcode: Opc, DL, VTList: M->getVTList(), Ops, MemVT: IntVT,
7812 MMO: M->getMemOperand(), DAG);
7813 }
7814
7815 EVT CastVT = getEquivalentMemType(Context&: *DAG.getContext(), VT: LoadVT);
7816 SDVTList VTList = DAG.getVTList(VT1: CastVT, VT2: MVT::Other);
7817 SDValue MemNode = getMemIntrinsicNode(Opcode: Opc, DL, VTList, Ops, MemVT: CastVT,
7818 MMO: M->getMemOperand(), DAG);
7819 return DAG.getMergeValues(
7820 Ops: {DAG.getNode(Opcode: ISD::BITCAST, DL, VT: LoadVT, Operand: MemNode), MemNode.getValue(R: 1)},
7821 dl: DL);
7822}
7823
7824static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
7825 SelectionDAG &DAG) {
7826 EVT VT = N->getValueType(ResNo: 0);
7827 unsigned CondCode = N->getConstantOperandVal(Num: 3);
7828 if (!ICmpInst::isIntPredicate(P: static_cast<ICmpInst::Predicate>(CondCode)))
7829 return DAG.getPOISON(VT);
7830
7831 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
7832
7833 SDValue LHS = N->getOperand(Num: 1);
7834 SDValue RHS = N->getOperand(Num: 2);
7835
7836 SDLoc DL(N);
7837
7838 EVT CmpVT = LHS.getValueType();
7839 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(VT: MVT::i16)) {
7840 unsigned PromoteOp =
7841 ICmpInst::isSigned(Pred: IcInput) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
7842 LHS = DAG.getNode(Opcode: PromoteOp, DL, VT: MVT::i32, Operand: LHS);
7843 RHS = DAG.getNode(Opcode: PromoteOp, DL, VT: MVT::i32, Operand: RHS);
7844 }
7845
7846 ISD::CondCode CCOpcode = getICmpCondCode(Pred: IcInput);
7847
7848 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7849 EVT CCVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: WavefrontSize);
7850
7851 SDValue SetCC = DAG.getNode(Opcode: AMDGPUISD::SETCC, DL, VT: CCVT, N1: LHS, N2: RHS,
7852 N3: DAG.getCondCode(Cond: CCOpcode));
7853 if (VT.bitsEq(VT: CCVT))
7854 return SetCC;
7855 return DAG.getZExtOrTrunc(Op: SetCC, DL, VT);
7856}
7857
7858static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
7859 SelectionDAG &DAG) {
7860 EVT VT = N->getValueType(ResNo: 0);
7861
7862 unsigned CondCode = N->getConstantOperandVal(Num: 3);
7863 if (!FCmpInst::isFPPredicate(P: static_cast<FCmpInst::Predicate>(CondCode)))
7864 return DAG.getPOISON(VT);
7865
7866 SDValue Src0 = N->getOperand(Num: 1);
7867 SDValue Src1 = N->getOperand(Num: 2);
7868 EVT CmpVT = Src0.getValueType();
7869 SDLoc SL(N);
7870
7871 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(VT: CmpVT)) {
7872 Src0 = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Src0);
7873 Src1 = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Src1);
7874 }
7875
7876 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
7877 ISD::CondCode CCOpcode = getFCmpCondCode(Pred: IcInput);
7878 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7879 EVT CCVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: WavefrontSize);
7880 SDValue SetCC = DAG.getNode(Opcode: AMDGPUISD::SETCC, DL: SL, VT: CCVT, N1: Src0, N2: Src1,
7881 N3: DAG.getCondCode(Cond: CCOpcode));
7882 if (VT.bitsEq(VT: CCVT))
7883 return SetCC;
7884 return DAG.getZExtOrTrunc(Op: SetCC, DL: SL, VT);
7885}
7886
7887static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
7888 SelectionDAG &DAG) {
7889 EVT VT = N->getValueType(ResNo: 0);
7890 SDValue Src = N->getOperand(Num: 1);
7891 SDLoc SL(N);
7892
7893 if (Src.getOpcode() == ISD::SETCC) {
7894 SDValue Op0 = Src.getOperand(i: 0);
7895 SDValue Op1 = Src.getOperand(i: 1);
7896 // Need to expand bfloat to float for comparison (setcc).
7897 if (Op0.getValueType() == MVT::bf16) {
7898 Op0 = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Op0);
7899 Op1 = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Op1);
7900 }
7901 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
7902 return DAG.getNode(Opcode: AMDGPUISD::SETCC, DL: SL, VT, N1: Op0, N2: Op1, N3: Src.getOperand(i: 2));
7903 }
7904 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Val&: Src)) {
7905 // (ballot 0) -> 0
7906 if (Arg->isZero())
7907 return DAG.getConstant(Val: 0, DL: SL, VT);
7908
7909 // (ballot 1) -> EXEC/EXEC_LO
7910 if (Arg->isOne()) {
7911 Register Exec;
7912 if (VT.getScalarSizeInBits() == 32)
7913 Exec = AMDGPU::EXEC_LO;
7914 else if (VT.getScalarSizeInBits() == 64)
7915 Exec = AMDGPU::EXEC;
7916 else
7917 return SDValue();
7918
7919 return DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: SL, Reg: Exec, VT);
7920 }
7921 }
7922
7923 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
7924 // ISD::SETNE)
7925 return DAG.getNode(
7926 Opcode: AMDGPUISD::SETCC, DL: SL, VT, N1: DAG.getZExtOrTrunc(Op: Src, DL: SL, VT: MVT::i32),
7927 N2: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32), N3: DAG.getCondCode(Cond: ISD::SETNE));
7928}
7929
7930static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
7931 EVT VT);
7932
7933static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
7934 SelectionDAG &DAG) {
7935 EVT VT = N->getValueType(ResNo: 0);
7936 unsigned ValSize = VT.getSizeInBits();
7937 unsigned IID = N->getConstantOperandVal(Num: 0);
7938 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
7939 IID == Intrinsic::amdgcn_permlanex16;
7940 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
7941 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
7942 bool IsPermlaneShuffle = IID == Intrinsic::amdgcn_permlane_bcast ||
7943 IID == Intrinsic::amdgcn_permlane_up ||
7944 IID == Intrinsic::amdgcn_permlane_down ||
7945 IID == Intrinsic::amdgcn_permlane_xor;
7946 SDLoc SL(N);
7947 MVT IntVT = MVT::getIntegerVT(BitWidth: ValSize);
7948 const GCNSubtarget *ST = TLI.getSubtarget();
7949
7950 if ((IsPermLane16 && !ST->hasPermlane16Insts()) ||
7951 (IID == Intrinsic::amdgcn_mov_dpp8 && !ST->hasDPP8()))
7952 return emitRemovedIntrinsicError(DAG, DL: SL, VT);
7953
7954 unsigned SplitSize = 32;
7955 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
7956 ST->hasDPALU_DPP() &&
7957 AMDGPU::isLegalDPALU_DPPControl(ST: *ST, DC: N->getConstantOperandVal(Num: 3)))
7958 SplitSize = 64;
7959
7960 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
7961 SDValue Src2, MVT ValT) -> SDValue {
7962 SmallVector<SDValue, 8> Operands;
7963 switch (IID) {
7964 case Intrinsic::amdgcn_permlane16:
7965 case Intrinsic::amdgcn_permlanex16:
7966 case Intrinsic::amdgcn_update_dpp:
7967 Operands.push_back(Elt: N->getOperand(Num: 6));
7968 Operands.push_back(Elt: N->getOperand(Num: 5));
7969 Operands.push_back(Elt: N->getOperand(Num: 4));
7970 [[fallthrough]];
7971 case Intrinsic::amdgcn_writelane:
7972 case Intrinsic::amdgcn_permlane_bcast:
7973 case Intrinsic::amdgcn_permlane_up:
7974 case Intrinsic::amdgcn_permlane_down:
7975 case Intrinsic::amdgcn_permlane_xor:
7976 Operands.push_back(Elt: Src2);
7977 [[fallthrough]];
7978 case Intrinsic::amdgcn_readlane:
7979 case Intrinsic::amdgcn_set_inactive:
7980 case Intrinsic::amdgcn_set_inactive_chain_arg:
7981 case Intrinsic::amdgcn_mov_dpp8:
7982 Operands.push_back(Elt: Src1);
7983 [[fallthrough]];
7984 case Intrinsic::amdgcn_readfirstlane:
7985 case Intrinsic::amdgcn_permlane64:
7986 Operands.push_back(Elt: Src0);
7987 break;
7988 default:
7989 llvm_unreachable("unhandled lane op");
7990 }
7991
7992 Operands.push_back(Elt: DAG.getTargetConstant(Val: IID, DL: SL, VT: MVT::i32));
7993 std::reverse(first: Operands.begin(), last: Operands.end());
7994
7995 if (SDNode *GL = N->getGluedNode()) {
7996 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7997 GL = GL->getOperand(Num: 0).getNode();
7998 Operands.push_back(Elt: DAG.getNode(Opcode: ISD::CONVERGENCECTRL_GLUE, DL: SL, VT: MVT::Glue,
7999 Operand: SDValue(GL, 0)));
8000 }
8001
8002 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: ValT, Ops: Operands);
8003 };
8004
8005 SDValue Src0 = N->getOperand(Num: 1);
8006 SDValue Src1, Src2;
8007 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
8008 IID == Intrinsic::amdgcn_mov_dpp8 ||
8009 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16 ||
8010 IsPermlaneShuffle) {
8011 Src1 = N->getOperand(Num: 2);
8012 if (IID == Intrinsic::amdgcn_writelane ||
8013 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16 ||
8014 IsPermlaneShuffle)
8015 Src2 = N->getOperand(Num: 3);
8016 }
8017
8018 if (ValSize == SplitSize) {
8019 // Already legal
8020 return SDValue();
8021 }
8022
8023 if (ValSize < 32) {
8024 bool IsFloat = VT.isFloatingPoint();
8025 Src0 = DAG.getAnyExtOrTrunc(Op: IsFloat ? DAG.getBitcast(VT: IntVT, V: Src0) : Src0,
8026 DL: SL, VT: MVT::i32);
8027
8028 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
8029 Src1 = DAG.getAnyExtOrTrunc(Op: IsFloat ? DAG.getBitcast(VT: IntVT, V: Src1) : Src1,
8030 DL: SL, VT: MVT::i32);
8031 }
8032
8033 if (IID == Intrinsic::amdgcn_writelane) {
8034 Src2 = DAG.getAnyExtOrTrunc(Op: IsFloat ? DAG.getBitcast(VT: IntVT, V: Src2) : Src2,
8035 DL: SL, VT: MVT::i32);
8036 }
8037
8038 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
8039 SDValue Trunc = DAG.getAnyExtOrTrunc(Op: LaneOp, DL: SL, VT: IntVT);
8040 return IsFloat ? DAG.getBitcast(VT, V: Trunc) : Trunc;
8041 }
8042
8043 if (ValSize % SplitSize != 0)
8044 return SDValue();
8045
8046 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
8047 EVT VT = N->getValueType(ResNo: 0);
8048 unsigned NE = VT.getVectorNumElements();
8049 EVT EltVT = VT.getVectorElementType();
8050 SmallVector<SDValue, 8> Scalars;
8051 unsigned NumOperands = N->getNumOperands();
8052 SmallVector<SDValue, 4> Operands(NumOperands);
8053 SDNode *GL = N->getGluedNode();
8054
8055 // only handle convergencectrl_glue
8056 assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
8057
8058 for (unsigned i = 0; i != NE; ++i) {
8059 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
8060 ++j) {
8061 SDValue Operand = N->getOperand(Num: j);
8062 EVT OperandVT = Operand.getValueType();
8063 if (OperandVT.isVector()) {
8064 // A vector operand; extract a single element.
8065 EVT OperandEltVT = OperandVT.getVectorElementType();
8066 Operands[j] = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: OperandEltVT,
8067 N1: Operand, N2: DAG.getVectorIdxConstant(Val: i, DL: SL));
8068 } else {
8069 // A scalar operand; just use it as is.
8070 Operands[j] = Operand;
8071 }
8072 }
8073
8074 if (GL)
8075 Operands[NumOperands - 1] =
8076 DAG.getNode(Opcode: ISD::CONVERGENCECTRL_GLUE, DL: SL, VT: MVT::Glue,
8077 Operand: SDValue(GL->getOperand(Num: 0).getNode(), 0));
8078
8079 Scalars.push_back(Elt: DAG.getNode(Opcode: N->getOpcode(), DL: SL, VT: EltVT, Ops: Operands));
8080 }
8081
8082 EVT VecVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: EltVT, NumElements: NE);
8083 return DAG.getBuildVector(VT: VecVT, DL: SL, Ops: Scalars);
8084 };
8085
8086 if (VT.isVector()) {
8087 switch (MVT::SimpleValueType EltTy =
8088 VT.getVectorElementType().getSimpleVT().SimpleTy) {
8089 case MVT::i32:
8090 case MVT::f32:
8091 if (SplitSize == 32) {
8092 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
8093 return unrollLaneOp(LaneOp.getNode());
8094 }
8095 [[fallthrough]];
8096 case MVT::i16:
8097 case MVT::f16:
8098 case MVT::bf16: {
8099 unsigned SubVecNumElt =
8100 SplitSize / VT.getVectorElementType().getSizeInBits();
8101 MVT SubVecVT = MVT::getVectorVT(VT: EltTy, NumElements: SubVecNumElt);
8102 SmallVector<SDValue, 4> Pieces;
8103 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
8104 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
8105 Src0SubVec = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: SubVecVT, N1: Src0,
8106 N2: DAG.getConstant(Val: EltIdx, DL: SL, VT: MVT::i32));
8107
8108 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
8109 IsPermLane16) {
8110 Src1SubVec = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: SubVecVT, N1: Src1,
8111 N2: DAG.getConstant(Val: EltIdx, DL: SL, VT: MVT::i32));
8112
8113 Pieces.push_back(
8114 Elt: createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT));
8115 } else if (IID == Intrinsic::amdgcn_writelane) {
8116 Src2SubVec = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: SubVecVT, N1: Src2,
8117 N2: DAG.getConstant(Val: EltIdx, DL: SL, VT: MVT::i32));
8118 Pieces.push_back(
8119 Elt: createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
8120 } else {
8121 Pieces.push_back(Elt: createLaneOp(Src0SubVec, Src1, Src2, SubVecVT));
8122 }
8123
8124 EltIdx += SubVecNumElt;
8125 }
8126 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SL, VT, Ops: Pieces);
8127 }
8128 default:
8129 // Handle all other cases by bitcasting to i32 vectors
8130 break;
8131 }
8132 }
8133
8134 MVT VecVT =
8135 MVT::getVectorVT(VT: MVT::getIntegerVT(BitWidth: SplitSize), NumElements: ValSize / SplitSize);
8136 Src0 = DAG.getBitcast(VT: VecVT, V: Src0);
8137
8138 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
8139 Src1 = DAG.getBitcast(VT: VecVT, V: Src1);
8140
8141 if (IID == Intrinsic::amdgcn_writelane)
8142 Src2 = DAG.getBitcast(VT: VecVT, V: Src2);
8143
8144 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
8145 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
8146 return DAG.getBitcast(VT, V: UnrolledLaneOp);
8147}
8148
8149static SDValue lowerWaveShuffle(const SITargetLowering &TLI, SDNode *N,
8150 SelectionDAG &DAG) {
8151 EVT VT = N->getValueType(ResNo: 0);
8152
8153 if (VT.getSizeInBits() != 32)
8154 return SDValue();
8155
8156 SDLoc SL(N);
8157
8158 SDValue Value = N->getOperand(Num: 1);
8159 SDValue Index = N->getOperand(Num: 2);
8160
8161 // ds_bpermute requires index to be multiplied by 4
8162 SDValue ShiftAmount = DAG.getShiftAmountConstant(Val: 2, VT: MVT::i32, DL: SL);
8163 SDValue ShiftedIndex =
8164 DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: Index.getValueType(), N1: Index, N2: ShiftAmount);
8165
8166 // Intrinsics will require i32 to operate on
8167 SDValue ValueI32 = DAG.getBitcast(VT: MVT::i32, V: Value);
8168
8169 auto MakeIntrinsic = [&DAG, &SL](unsigned IID, MVT RetVT,
8170 SmallVector<SDValue> IntrinArgs) -> SDValue {
8171 SmallVector<SDValue> Operands(1);
8172 Operands[0] = DAG.getTargetConstant(Val: IID, DL: SL, VT: MVT::i32);
8173 Operands.append(RHS: IntrinArgs);
8174 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: RetVT, Ops: Operands);
8175 };
8176
8177 // If we can bpermute across the whole wave, then just do that
8178 if (TLI.getSubtarget()->supportsWaveWideBPermute()) {
8179 SDValue BPermute = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
8180 {ShiftedIndex, ValueI32});
8181 return DAG.getBitcast(VT, V: BPermute);
8182 }
8183
8184 assert(TLI.getSubtarget()->isWave64());
8185
8186 // Otherwise, we need to make use of whole wave mode
8187 SDValue PoisonVal = DAG.getPOISON(VT: ValueI32->getValueType(ResNo: 0));
8188
8189 // Set inactive lanes to poison
8190 SDValue WWMValue = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
8191 {ValueI32, PoisonVal});
8192 SDValue WWMIndex = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
8193 {ShiftedIndex, PoisonVal});
8194
8195 SDValue Swapped =
8196 MakeIntrinsic(Intrinsic::amdgcn_permlane64, MVT::i32, {WWMValue});
8197
8198 // Get permutation of each half, then we'll select which one to use
8199 SDValue BPermSameHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
8200 {WWMIndex, WWMValue});
8201 SDValue BPermOtherHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
8202 MVT::i32, {WWMIndex, Swapped});
8203 SDValue BPermOtherHalfWWM =
8204 MakeIntrinsic(Intrinsic::amdgcn_wwm, MVT::i32, {BPermOtherHalf});
8205
8206 // Select which side to take the permute from
8207 SDValue ThreadIDMask = DAG.getAllOnesConstant(DL: SL, VT: MVT::i32);
8208 // We can get away with only using mbcnt_lo here since we're only
8209 // trying to detect which side of 32 each lane is on, and mbcnt_lo
8210 // returns 32 for lanes 32-63.
8211 SDValue ThreadID =
8212 MakeIntrinsic(Intrinsic::amdgcn_mbcnt_lo, MVT::i32,
8213 {ThreadIDMask, DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32)});
8214
8215 SDValue SameOrOtherHalf =
8216 DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32,
8217 N1: DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i32, N1: ThreadID, N2: Index),
8218 N2: DAG.getTargetConstant(Val: 32, DL: SL, VT: MVT::i32));
8219 SDValue UseSameHalf =
8220 DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: SameOrOtherHalf,
8221 RHS: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32), Cond: ISD::SETEQ);
8222 SDValue Result = DAG.getSelect(DL: SL, VT: MVT::i32, Cond: UseSameHalf, LHS: BPermSameHalf,
8223 RHS: BPermOtherHalfWWM);
8224 return DAG.getBitcast(VT, V: Result);
8225}
8226
8227void SITargetLowering::ReplaceNodeResults(SDNode *N,
8228 SmallVectorImpl<SDValue> &Results,
8229 SelectionDAG &DAG) const {
8230 switch (N->getOpcode()) {
8231 case ISD::INSERT_VECTOR_ELT: {
8232 if (SDValue Res = lowerINSERT_VECTOR_ELT(Op: SDValue(N, 0), DAG))
8233 Results.push_back(Elt: Res);
8234 return;
8235 }
8236 case ISD::EXTRACT_VECTOR_ELT: {
8237 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(Op: SDValue(N, 0), DAG))
8238 Results.push_back(Elt: Res);
8239 return;
8240 }
8241 case ISD::INTRINSIC_WO_CHAIN: {
8242 unsigned IID = N->getConstantOperandVal(Num: 0);
8243 switch (IID) {
8244 case Intrinsic::amdgcn_make_buffer_rsrc:
8245 Results.push_back(Elt: lowerPointerAsRsrcIntrin(Op: N, DAG));
8246 return;
8247 case Intrinsic::amdgcn_cvt_pkrtz: {
8248 SDValue Src0 = N->getOperand(Num: 1);
8249 SDValue Src1 = N->getOperand(Num: 2);
8250 SDLoc SL(N);
8251 SDValue Cvt =
8252 DAG.getNode(Opcode: AMDGPUISD::CVT_PKRTZ_F16_F32, DL: SL, VT: MVT::i32, N1: Src0, N2: Src1);
8253 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2f16, Operand: Cvt));
8254 return;
8255 }
8256 case Intrinsic::amdgcn_cvt_pknorm_i16:
8257 case Intrinsic::amdgcn_cvt_pknorm_u16:
8258 case Intrinsic::amdgcn_cvt_pk_i16:
8259 case Intrinsic::amdgcn_cvt_pk_u16: {
8260 SDValue Src0 = N->getOperand(Num: 1);
8261 SDValue Src1 = N->getOperand(Num: 2);
8262 SDLoc SL(N);
8263 unsigned Opcode;
8264
8265 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
8266 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
8267 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
8268 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
8269 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
8270 Opcode = AMDGPUISD::CVT_PK_I16_I32;
8271 else
8272 Opcode = AMDGPUISD::CVT_PK_U16_U32;
8273
8274 EVT VT = N->getValueType(ResNo: 0);
8275 if (isTypeLegal(VT))
8276 Results.push_back(Elt: DAG.getNode(Opcode, DL: SL, VT, N1: Src0, N2: Src1));
8277 else {
8278 SDValue Cvt = DAG.getNode(Opcode, DL: SL, VT: MVT::i32, N1: Src0, N2: Src1);
8279 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i16, Operand: Cvt));
8280 }
8281 return;
8282 }
8283 case Intrinsic::amdgcn_s_buffer_load: {
8284 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
8285 // s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG
8286 // combiner tries to merge the s_buffer_load_u8 with a sext instruction
8287 // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
8288 // s_buffer_load_i8.
8289 if (!Subtarget->hasScalarSubwordLoads())
8290 return;
8291 SDValue Op = SDValue(N, 0);
8292 SDValue Rsrc = Op.getOperand(i: 1);
8293 SDValue Offset = Op.getOperand(i: 2);
8294 SDValue CachePolicy = Op.getOperand(i: 3);
8295 EVT VT = Op.getValueType();
8296 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
8297 SDLoc DL(Op);
8298 MachineFunction &MF = DAG.getMachineFunction();
8299 const DataLayout &DataLayout = DAG.getDataLayout();
8300 Align Alignment =
8301 DataLayout.getABITypeAlign(Ty: VT.getTypeForEVT(Context&: *DAG.getContext()));
8302 MachineMemOperand *MMO = MF.getMachineMemOperand(
8303 PtrInfo: MachinePointerInfo(),
8304 F: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
8305 MachineMemOperand::MOInvariant,
8306 Size: VT.getStoreSize(), BaseAlignment: Alignment);
8307 SDValue LoadVal;
8308 if (!Offset->isDivergent()) {
8309 SDValue Ops[] = {Rsrc, // source register
8310 Offset, CachePolicy};
8311 SDValue BufferLoad =
8312 DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::SBUFFER_LOAD_UBYTE, dl: DL,
8313 VTList: DAG.getVTList(VT: MVT::i32), Ops, MemVT: VT, MMO);
8314 LoadVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: BufferLoad);
8315 } else {
8316 SDValue Ops[] = {
8317 DAG.getEntryNode(), // Chain
8318 Rsrc, // rsrc
8319 DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
8320 {}, // voffset
8321 {}, // soffset
8322 {}, // offset
8323 CachePolicy, // cachepolicy
8324 DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
8325 };
8326 setBufferOffsets(CombinedOffset: Offset, DAG, Offsets: &Ops[3], Alignment: Align(4));
8327 LoadVal = handleByteShortBufferLoads(DAG, LoadVT: VT, DL, Ops, MMO);
8328 }
8329 Results.push_back(Elt: LoadVal);
8330 return;
8331 }
8332 case Intrinsic::amdgcn_dead: {
8333 for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
8334 Results.push_back(Elt: DAG.getPOISON(VT: N->getValueType(ResNo: I)));
8335 return;
8336 }
8337 }
8338 break;
8339 }
8340 case ISD::INTRINSIC_W_CHAIN: {
8341 if (SDValue Res = LowerINTRINSIC_W_CHAIN(Op: SDValue(N, 0), DAG)) {
8342 if (Res.getOpcode() == ISD::MERGE_VALUES) {
8343 // FIXME: Hacky
8344 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
8345 Results.push_back(Elt: Res.getOperand(i: I));
8346 }
8347 } else {
8348 Results.push_back(Elt: Res);
8349 Results.push_back(Elt: Res.getValue(R: 1));
8350 }
8351 return;
8352 }
8353
8354 break;
8355 }
8356 case ISD::SELECT: {
8357 SDLoc SL(N);
8358 EVT VT = N->getValueType(ResNo: 0);
8359 EVT NewVT = getEquivalentMemType(Context&: *DAG.getContext(), VT);
8360 SDValue LHS = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVT, Operand: N->getOperand(Num: 1));
8361 SDValue RHS = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVT, Operand: N->getOperand(Num: 2));
8362
8363 EVT SelectVT = NewVT;
8364 if (NewVT.bitsLT(VT: MVT::i32)) {
8365 LHS = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i32, Operand: LHS);
8366 RHS = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i32, Operand: RHS);
8367 SelectVT = MVT::i32;
8368 }
8369
8370 SDValue NewSelect =
8371 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: SelectVT, N1: N->getOperand(Num: 0), N2: LHS, N3: RHS);
8372
8373 if (NewVT != SelectVT)
8374 NewSelect = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: NewVT, Operand: NewSelect);
8375 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: NewSelect));
8376 return;
8377 }
8378 case ISD::FNEG: {
8379 if (N->getValueType(ResNo: 0) != MVT::v2f16)
8380 break;
8381
8382 SDLoc SL(N);
8383 SDValue BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: N->getOperand(Num: 0));
8384
8385 SDValue Op = DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i32, N1: BC,
8386 N2: DAG.getConstant(Val: 0x80008000, DL: SL, VT: MVT::i32));
8387 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2f16, Operand: Op));
8388 return;
8389 }
8390 case ISD::FABS: {
8391 if (N->getValueType(ResNo: 0) != MVT::v2f16)
8392 break;
8393
8394 SDLoc SL(N);
8395 SDValue BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: N->getOperand(Num: 0));
8396
8397 SDValue Op = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32, N1: BC,
8398 N2: DAG.getConstant(Val: 0x7fff7fff, DL: SL, VT: MVT::i32));
8399 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2f16, Operand: Op));
8400 return;
8401 }
8402 case ISD::FSQRT: {
8403 if (N->getValueType(ResNo: 0) != MVT::f16)
8404 break;
8405 Results.push_back(Elt: lowerFSQRTF16(Op: SDValue(N, 0), DAG));
8406 break;
8407 }
8408 default:
8409 AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
8410 break;
8411 }
8412}
8413
8414/// Helper function for LowerBRCOND
8415static SDNode *findUser(SDValue Value, unsigned Opcode) {
8416
8417 for (SDUse &U : Value->uses()) {
8418 if (U.get() != Value)
8419 continue;
8420
8421 if (U.getUser()->getOpcode() == Opcode)
8422 return U.getUser();
8423 }
8424 return nullptr;
8425}
8426
8427unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
8428 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
8429 switch (Intr->getConstantOperandVal(Num: 1)) {
8430 case Intrinsic::amdgcn_if:
8431 return AMDGPUISD::IF;
8432 case Intrinsic::amdgcn_else:
8433 return AMDGPUISD::ELSE;
8434 case Intrinsic::amdgcn_loop:
8435 return AMDGPUISD::LOOP;
8436 case Intrinsic::amdgcn_end_cf:
8437 llvm_unreachable("should not occur");
8438 default:
8439 return 0;
8440 }
8441 }
8442
8443 // break, if_break, else_break are all only used as inputs to loop, not
8444 // directly as branch conditions.
8445 return 0;
8446}
8447
8448bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
8449 const Triple &TT = getTargetMachine().getTargetTriple();
8450 return (GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
8451 GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
8452 AMDGPU::shouldEmitConstantsToTextSection(TT);
8453}
8454
8455bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
8456 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
8457 return false;
8458
8459 // FIXME: Either avoid relying on address space here or change the default
8460 // address space for functions to avoid the explicit check.
8461 return (GV->getValueType()->isFunctionTy() ||
8462 !isNonGlobalAddrSpace(AS: GV->getAddressSpace())) &&
8463 !shouldEmitFixup(GV) && !getTargetMachine().shouldAssumeDSOLocal(GV);
8464}
8465
8466bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
8467 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
8468}
8469
8470bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const {
8471 if (!GV->hasExternalLinkage())
8472 return true;
8473
8474 // With object linking, external LDS declarations need relocations so the
8475 // linker can assign their offsets.
8476 if (AMDGPUTargetMachine::EnableObjectLinking) {
8477 if (const auto *GVar = dyn_cast<GlobalVariable>(Val: GV)) {
8478 if (GVar->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
8479 assert(GVar->isDeclaration() && "AS3 GVs should be declaration here "
8480 "when object linking is enabled");
8481 return false;
8482 }
8483 }
8484 }
8485
8486 const auto OS = getTargetMachine().getTargetTriple().getOS();
8487 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
8488}
8489
8490/// This transforms the control flow intrinsics to get the branch destination as
8491/// last parameter, also switches branch target with BR if the need arise
8492SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
8493 SDLoc DL(BRCOND);
8494
8495 SDNode *Intr = BRCOND.getOperand(i: 1).getNode();
8496 SDValue Target = BRCOND.getOperand(i: 2);
8497 SDNode *BR = nullptr;
8498 SDNode *SetCC = nullptr;
8499
8500 switch (Intr->getOpcode()) {
8501 case ISD::SETCC: {
8502 // As long as we negate the condition everything is fine
8503 SetCC = Intr;
8504 Intr = SetCC->getOperand(Num: 0).getNode();
8505 break;
8506 }
8507 case ISD::XOR: {
8508 // Similar to SETCC, if we have (xor c, -1), we will be fine.
8509 SDValue LHS = Intr->getOperand(Num: 0);
8510 SDValue RHS = Intr->getOperand(Num: 1);
8511 if (auto *C = dyn_cast<ConstantSDNode>(Val&: RHS); C && C->getZExtValue()) {
8512 Intr = LHS.getNode();
8513 break;
8514 }
8515 [[fallthrough]];
8516 }
8517 default: {
8518 // Get the target from BR if we don't negate the condition
8519 BR = findUser(Value: BRCOND, Opcode: ISD::BR);
8520 assert(BR && "brcond missing unconditional branch user");
8521 Target = BR->getOperand(Num: 1);
8522 }
8523 }
8524
8525 unsigned CFNode = isCFIntrinsic(Intr);
8526 if (CFNode == 0) {
8527 // This is a uniform branch so we don't need to legalize.
8528 return BRCOND;
8529 }
8530
8531 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
8532 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
8533
8534 assert(!SetCC ||
8535 (SetCC->getConstantOperandVal(1) == 1 &&
8536 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
8537 ISD::SETNE));
8538
8539 // operands of the new intrinsic call
8540 SmallVector<SDValue, 4> Ops;
8541 if (HaveChain)
8542 Ops.push_back(Elt: BRCOND.getOperand(i: 0));
8543
8544 Ops.append(in_start: Intr->op_begin() + (HaveChain ? 2 : 1), in_end: Intr->op_end());
8545 Ops.push_back(Elt: Target);
8546
8547 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
8548
8549 // build the new intrinsic call
8550 SDNode *Result = DAG.getNode(Opcode: CFNode, DL, VTList: DAG.getVTList(VTs: Res), Ops).getNode();
8551
8552 if (!HaveChain) {
8553 SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(i: 0)};
8554
8555 Result = DAG.getMergeValues(Ops, dl: DL).getNode();
8556 }
8557
8558 if (BR) {
8559 // Give the branch instruction our target
8560 SDValue Ops[] = {BR->getOperand(Num: 0), BRCOND.getOperand(i: 2)};
8561 SDValue NewBR = DAG.getNode(Opcode: ISD::BR, DL, VTList: BR->getVTList(), Ops);
8562 DAG.ReplaceAllUsesWith(From: BR, To: NewBR.getNode());
8563 }
8564
8565 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
8566
8567 // Copy the intrinsic results to registers
8568 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
8569 SDNode *CopyToReg = findUser(Value: SDValue(Intr, i), Opcode: ISD::CopyToReg);
8570 if (!CopyToReg)
8571 continue;
8572
8573 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: CopyToReg->getOperand(Num: 1),
8574 N: SDValue(Result, i - 1), Glue: SDValue());
8575
8576 DAG.ReplaceAllUsesWith(From: SDValue(CopyToReg, 0), To: CopyToReg->getOperand(Num: 0));
8577 }
8578
8579 // Remove the old intrinsic from the chain
8580 DAG.ReplaceAllUsesOfValueWith(From: SDValue(Intr, Intr->getNumValues() - 1),
8581 To: Intr->getOperand(Num: 0));
8582
8583 return Chain;
8584}
8585
8586SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
8587 MVT VT = Op.getSimpleValueType();
8588 SDLoc DL(Op);
8589 // Checking the depth
8590 if (Op.getConstantOperandVal(i: 0) != 0)
8591 return DAG.getConstant(Val: 0, DL, VT);
8592
8593 MachineFunction &MF = DAG.getMachineFunction();
8594 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8595 // Check for kernel and shader functions
8596 if (Info->isEntryFunction())
8597 return DAG.getConstant(Val: 0, DL, VT);
8598
8599 MachineFrameInfo &MFI = MF.getFrameInfo();
8600 // There is a call to @llvm.returnaddress in this function
8601 MFI.setReturnAddressIsTaken(true);
8602
8603 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
8604 // Get the return address reg and mark it as an implicit live-in
8605 Register Reg = MF.addLiveIn(PReg: TRI->getReturnAddressReg(MF),
8606 RC: getRegClassFor(VT, isDivergent: Op.getNode()->isDivergent()));
8607
8608 return DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: DL, Reg, VT);
8609}
8610
8611SDValue SITargetLowering::LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const {
8612 MachineFunction &MF = DAG.getMachineFunction();
8613 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
8614
8615 // For functions that set up their own stack, select the GET_STACK_BASE
8616 // pseudo.
8617 if (MFI->isBottomOfStack())
8618 return Op;
8619
8620 // For everything else, create a dummy stack object.
8621 int FI = MF.getFrameInfo().CreateFixedObject(Size: 1, SPOffset: 0, /*IsImmutable=*/false);
8622 return DAG.getFrameIndex(FI, VT: Op.getValueType());
8623}
8624
8625SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
8626 const SDLoc &DL, EVT VT) const {
8627 return Op.getValueType().bitsLE(VT)
8628 ? DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT, Operand: Op)
8629 : DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT, N1: Op,
8630 N2: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32));
8631}
8632
8633SDValue SITargetLowering::splitFP_ROUNDVectorOp(SDValue Op,
8634 SelectionDAG &DAG) const {
8635 EVT DstVT = Op.getValueType();
8636 unsigned NumElts = DstVT.getVectorNumElements();
8637 assert(NumElts > 2 && isPowerOf2_32(NumElts));
8638
8639 auto [Lo, Hi] = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 0);
8640
8641 SDLoc DL(Op);
8642 unsigned Opc = Op.getOpcode();
8643 SDValue Flags = Op.getOperand(i: 1);
8644 EVT HalfDstVT =
8645 EVT::getVectorVT(Context&: *DAG.getContext(), VT: DstVT.getScalarType(), NumElements: NumElts / 2);
8646 SDValue OpLo = DAG.getNode(Opcode: Opc, DL, VT: HalfDstVT, N1: Lo, N2: Flags);
8647 SDValue OpHi = DAG.getNode(Opcode: Opc, DL, VT: HalfDstVT, N1: Hi, N2: Flags);
8648
8649 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: DstVT, N1: OpLo, N2: OpHi);
8650}
8651
8652SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
8653 SDValue Src = Op.getOperand(i: 0);
8654 EVT SrcVT = Src.getValueType();
8655 EVT DstVT = Op.getValueType();
8656
8657 if (DstVT.isVectorOf(EltVT: MVT::f16)) {
8658 assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
8659 if (SrcVT.getScalarType() != MVT::f32)
8660 return SDValue();
8661 return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
8662 }
8663
8664 if (SrcVT.getScalarType() != MVT::f64)
8665 return Op;
8666
8667 SDLoc DL(Op);
8668 if (DstVT == MVT::f16) {
8669 // TODO: Handle strictfp
8670 if (Op.getOpcode() != ISD::FP_ROUND)
8671 return Op;
8672
8673 if (!Subtarget->has16BitInsts()) {
8674 SDValue FpToFp16 = DAG.getNode(Opcode: ISD::FP_TO_FP16, DL, VT: MVT::i32, Operand: Src);
8675 SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: FpToFp16);
8676 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f16, Operand: Trunc);
8677 }
8678 if (Op->getFlags().hasApproximateFuncs()) {
8679 SDValue Flags = Op.getOperand(i: 1);
8680 SDValue Src32 = DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: MVT::f32, N1: Src, N2: Flags);
8681 return DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: MVT::f16, N1: Src32, N2: Flags);
8682 }
8683 SDValue FpToFp16 = LowerF64ToF16Safe(Src, DL, DAG);
8684 SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: FpToFp16);
8685 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f16, Operand: Trunc);
8686 }
8687
8688 assert(DstVT.getScalarType() == MVT::bf16 &&
8689 "custom lower FP_ROUND for f16 or bf16");
8690 assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal");
8691
8692 // Round-inexact-to-odd f64 to f32, then do the final rounding using the
8693 // hardware f32 -> bf16 instruction.
8694 EVT F32VT = SrcVT.changeElementType(Context&: *DAG.getContext(), EltVT: MVT::f32);
8695 SDValue Rod = expandRoundInexactToOdd(ResultVT: F32VT, Op: Src, DL, DAG);
8696 return DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: DstVT, N1: Rod,
8697 N2: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32));
8698}
8699
8700SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
8701 SelectionDAG &DAG) const {
8702 EVT VT = Op.getValueType();
8703 const MachineFunction &MF = DAG.getMachineFunction();
8704 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8705 bool IsIEEEMode = Info->getMode().IEEE;
8706
8707 // FIXME: Assert during selection that this is only selected for
8708 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
8709 // mode functions, but this happens to be OK since it's only done in cases
8710 // where there is known no sNaN.
8711 if (IsIEEEMode)
8712 return expandFMINNUM_FMAXNUM(N: Op.getNode(), DAG);
8713
8714 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
8715 VT == MVT::v16bf16)
8716 return splitBinaryVectorOp(Op, DAG);
8717 return Op;
8718}
8719
8720SDValue
8721SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
8722 SelectionDAG &DAG) const {
8723 EVT VT = Op.getValueType();
8724 const MachineFunction &MF = DAG.getMachineFunction();
8725 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8726 bool IsIEEEMode = Info->getMode().IEEE;
8727
8728 if (IsIEEEMode)
8729 return expandFMINIMUMNUM_FMAXIMUMNUM(N: Op.getNode(), DAG);
8730
8731 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
8732 VT == MVT::v16bf16)
8733 return splitBinaryVectorOp(Op, DAG);
8734 return Op;
8735}
8736
8737SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op,
8738 SelectionDAG &DAG) const {
8739 EVT VT = Op.getValueType();
8740 if (VT.isVector())
8741 return splitBinaryVectorOp(Op, DAG);
8742
8743 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
8744 !Subtarget->hasMinimum3Maximum3F16() &&
8745 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
8746 "should not need to widen f16 minimum/maximum to v2f16");
8747
8748 // Widen f16 operation to v2f16
8749
8750 // fminimum f16:x, f16:y ->
8751 // extract_vector_elt (fminimum (v2f16 (scalar_to_vector x))
8752 // (v2f16 (scalar_to_vector y))), 0
8753 SDLoc SL(Op);
8754 SDValue WideSrc0 =
8755 DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: SL, VT: MVT::v2f16, Operand: Op.getOperand(i: 0));
8756 SDValue WideSrc1 =
8757 DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: SL, VT: MVT::v2f16, Operand: Op.getOperand(i: 1));
8758
8759 SDValue Widened =
8760 DAG.getNode(Opcode: Op.getOpcode(), DL: SL, VT: MVT::v2f16, N1: WideSrc0, N2: WideSrc1);
8761
8762 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::f16, N1: Widened,
8763 N2: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32));
8764}
8765
8766SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
8767 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
8768 EVT VT = Op.getValueType();
8769 assert(VT == MVT::f16);
8770
8771 SDValue Exp = Op.getOperand(i: IsStrict ? 2 : 1);
8772 EVT ExpVT = Exp.getValueType();
8773 if (ExpVT == MVT::i16)
8774 return Op;
8775
8776 SDLoc DL(Op);
8777
8778 // Correct the exponent type for f16 to i16.
8779 // Clamp the range of the exponent to the instruction's range.
8780
8781 // TODO: This should be a generic narrowing legalization, and can easily be
8782 // for GlobalISel.
8783
8784 SDValue MinExp = DAG.getSignedConstant(Val: minIntN(N: 16), DL, VT: ExpVT);
8785 SDValue ClampMin = DAG.getNode(Opcode: ISD::SMAX, DL, VT: ExpVT, N1: Exp, N2: MinExp);
8786
8787 SDValue MaxExp = DAG.getSignedConstant(Val: maxIntN(N: 16), DL, VT: ExpVT);
8788 SDValue Clamp = DAG.getNode(Opcode: ISD::SMIN, DL, VT: ExpVT, N1: ClampMin, N2: MaxExp);
8789
8790 SDValue TruncExp = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: Clamp);
8791
8792 if (IsStrict) {
8793 return DAG.getNode(Opcode: ISD::STRICT_FLDEXP, DL, ResultTys: {VT, MVT::Other},
8794 Ops: {Op.getOperand(i: 0), Op.getOperand(i: 1), TruncExp});
8795 }
8796
8797 return DAG.getNode(Opcode: ISD::FLDEXP, DL, VT, N1: Op.getOperand(i: 0), N2: TruncExp);
8798}
8799
8800static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
8801 switch (Op->getOpcode()) {
8802 case ISD::ABS:
8803 case ISD::SRA:
8804 case ISD::SMIN:
8805 case ISD::SMAX:
8806 return ISD::SIGN_EXTEND;
8807 case ISD::SRL:
8808 case ISD::UMIN:
8809 case ISD::UMAX:
8810 case ISD::USUBSAT:
8811 return ISD::ZERO_EXTEND;
8812 case ISD::ADD:
8813 case ISD::SUB:
8814 case ISD::AND:
8815 case ISD::OR:
8816 case ISD::XOR:
8817 case ISD::SHL:
8818 case ISD::SELECT:
8819 case ISD::MUL:
8820 // operation result won't be influenced by garbage high bits.
8821 // TODO: are all of those cases correct, and are there more?
8822 return ISD::ANY_EXTEND;
8823 case ISD::SETCC: {
8824 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: 2))->get();
8825 return ISD::isSignedIntSetCC(Code: CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
8826 }
8827 default:
8828 llvm_unreachable("unexpected opcode!");
8829 }
8830}
8831
8832SDValue
8833SITargetLowering::promoteUniformUnaryOpToI32(SDValue Op,
8834 DAGCombinerInfo &DCI) const {
8835 EVT OpTy = Op.getValueType();
8836 SelectionDAG &DAG = DCI.DAG;
8837 EVT ExtTy = OpTy.changeElementType(Context&: *DAG.getContext(), EltVT: MVT::i32);
8838
8839 if (isNarrowingProfitable(N: Op.getNode(), SrcVT: ExtTy, DestVT: OpTy))
8840 return SDValue();
8841
8842 SDLoc DL(Op);
8843 SDValue Input = Op.getOperand(i: 0);
8844 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
8845 Input = DAG.getNode(Opcode: ExtOp, DL, VT: ExtTy, Operand: Input);
8846
8847 SDValue NewVal = DAG.getNode(Opcode: Op.getOpcode(), DL, VT: ExtTy, Operand: Input);
8848
8849 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: OpTy, Operand: NewVal);
8850}
8851
8852SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
8853 DAGCombinerInfo &DCI) const {
8854 const unsigned Opc = Op.getOpcode();
8855 assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
8856 Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
8857 Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
8858 Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
8859 Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX ||
8860 Opc == ISD::USUBSAT);
8861
8862 EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
8863 : Op->getOperand(Num: 0).getValueType();
8864 auto &DAG = DCI.DAG;
8865 auto ExtTy = OpTy.changeElementType(Context&: *DAG.getContext(), EltVT: MVT::i32);
8866
8867 if (DCI.isBeforeLegalizeOps() ||
8868 isNarrowingProfitable(N: Op.getNode(), SrcVT: ExtTy, DestVT: OpTy))
8869 return SDValue();
8870
8871 SDLoc DL(Op);
8872 SDValue LHS;
8873 SDValue RHS;
8874 if (Opc == ISD::SELECT) {
8875 LHS = Op->getOperand(Num: 1);
8876 RHS = Op->getOperand(Num: 2);
8877 } else {
8878 LHS = Op->getOperand(Num: 0);
8879 RHS = Op->getOperand(Num: 1);
8880 }
8881
8882 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
8883 LHS = DAG.getNode(Opcode: ExtOp, DL, VT: ExtTy, Operand: {LHS});
8884
8885 // Special case: for shifts, the RHS always needs a zext.
8886 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
8887 RHS = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: ExtTy, Operand: {RHS});
8888 else
8889 RHS = DAG.getNode(Opcode: ExtOp, DL, VT: ExtTy, Operand: {RHS});
8890
8891 // setcc always return i1/i1 vec so no need to truncate after.
8892 if (Opc == ISD::SETCC) {
8893 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: 2))->get();
8894 return DAG.getSetCC(DL, VT: Op.getValueType(), LHS, RHS, Cond: CC);
8895 }
8896
8897 // For other ops, we extend the operation's return type as well so we need to
8898 // truncate back to the original type.
8899 SDValue NewVal;
8900 if (Opc == ISD::SELECT)
8901 NewVal = DAG.getNode(Opcode: ISD::SELECT, DL, VT: ExtTy, Ops: {Op->getOperand(Num: 0), LHS, RHS});
8902 else
8903 NewVal = DAG.getNode(Opcode: Opc, DL, VT: ExtTy, Ops: {LHS, RHS});
8904
8905 return DAG.getZExtOrTrunc(Op: NewVal, DL, VT: OpTy);
8906}
8907
8908SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
8909 SDValue Mag = Op.getOperand(i: 0);
8910 EVT MagVT = Mag.getValueType();
8911
8912 if (MagVT.getVectorNumElements() > 2)
8913 return splitBinaryVectorOp(Op, DAG);
8914
8915 SDValue Sign = Op.getOperand(i: 1);
8916 EVT SignVT = Sign.getValueType();
8917
8918 if (MagVT == SignVT)
8919 return Op;
8920
8921 // fcopysign v2f16:mag, v2f32:sign ->
8922 // fcopysign v2f16:mag,
8923 // bitcast (trunc (srl (bitcast sign to v2i32), 16) to v2i16)
8924
8925 SDLoc SL(Op);
8926 SDValue SignAsInt32 = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Sign);
8927 SDValue ShiftAmt = DAG.getShiftAmountConstant(Val: 16, VT: MVT::v2i32, DL: SL);
8928 SDValue SignShifted =
8929 DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: MVT::v2i32, N1: SignAsInt32, N2: ShiftAmt);
8930 SDValue SignAsInt16 = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::v2i16, Operand: SignShifted);
8931
8932 SDValue SignAsHalf16 = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MagVT, Operand: SignAsInt16);
8933
8934 return DAG.getNode(Opcode: ISD::FCOPYSIGN, DL: SL, VT: MagVT, N1: Mag, N2: SignAsHalf16);
8935}
8936
8937// Custom lowering for vector multiplications and s_mul_u64.
8938SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
8939 EVT VT = Op.getValueType();
8940
8941 // Split vector operands.
8942 if (VT.isVector())
8943 return splitBinaryVectorOp(Op, DAG);
8944
8945 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
8946
8947 // There are four ways to lower s_mul_u64:
8948 //
8949 // 1. If all the operands are uniform, then we lower it as it is.
8950 //
8951 // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit
8952 // multiplications because there is not a vector equivalent of s_mul_u64.
8953 //
8954 // 3. If the cost model decides that it is more efficient to use vector
8955 // registers, then we have to split s_mul_u64 in 32-bit multiplications.
8956 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
8957 //
8958 // 4. If the cost model decides to use vector registers and both of the
8959 // operands are zero-extended/sign-extended from 32-bits, then we split the
8960 // s_mul_u64 in two 32-bit multiplications. The problem is that it is not
8961 // possible to check if the operands are zero-extended or sign-extended in
8962 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
8963 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
8964 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
8965 // If the cost model decides that we have to use vector registers, then
8966 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) split s_mul_u64_u32/
8967 // s_mul_i64_i32_pseudo in two vector multiplications. If the cost model
8968 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
8969 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
8970 // SIInstrInfo.cpp .
8971
8972 if (Op->isDivergent())
8973 return SDValue();
8974
8975 SDValue Op0 = Op.getOperand(i: 0);
8976 SDValue Op1 = Op.getOperand(i: 1);
8977 // If all the operands are zero-enteted to 32-bits, then we replace s_mul_u64
8978 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
8979 // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
8980 KnownBits Op0KnownBits = DAG.computeKnownBits(Op: Op0);
8981 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
8982 KnownBits Op1KnownBits = DAG.computeKnownBits(Op: Op1);
8983 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
8984 SDLoc SL(Op);
8985 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
8986 return SDValue(
8987 DAG.getMachineNode(Opcode: AMDGPU::S_MUL_U64_U32_PSEUDO, dl: SL, VT, Op1: Op0, Op2: Op1), 0);
8988 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op: Op0);
8989 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op: Op1);
8990 if (Op0SignBits >= 33 && Op1SignBits >= 33)
8991 return SDValue(
8992 DAG.getMachineNode(Opcode: AMDGPU::S_MUL_I64_I32_PSEUDO, dl: SL, VT, Op1: Op0, Op2: Op1), 0);
8993 // If all the operands are uniform, then we lower s_mul_u64 as it is.
8994 return Op;
8995}
8996
8997SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
8998 EVT VT = Op.getValueType();
8999 SDLoc SL(Op);
9000 SDValue LHS = Op.getOperand(i: 0);
9001 SDValue RHS = Op.getOperand(i: 1);
9002 bool isSigned = Op.getOpcode() == ISD::SMULO;
9003
9004 if (ConstantSDNode *RHSC = isConstOrConstSplat(N: RHS)) {
9005 const APInt &C = RHSC->getAPIntValue();
9006 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
9007 if (C.isPowerOf2()) {
9008 // smulo(x, signed_min) is same as umulo(x, signed_min).
9009 bool UseArithShift = isSigned && !C.isMinSignedValue();
9010 SDValue ShiftAmt = DAG.getConstant(Val: C.logBase2(), DL: SL, VT: MVT::i32);
9011 SDValue Result = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT, N1: LHS, N2: ShiftAmt);
9012 SDValue Overflow =
9013 DAG.getSetCC(DL: SL, VT: MVT::i1,
9014 LHS: DAG.getNode(Opcode: UseArithShift ? ISD::SRA : ISD::SRL, DL: SL, VT,
9015 N1: Result, N2: ShiftAmt),
9016 RHS: LHS, Cond: ISD::SETNE);
9017 return DAG.getMergeValues(Ops: {Result, Overflow}, dl: SL);
9018 }
9019 }
9020
9021 SDValue Result = DAG.getNode(Opcode: ISD::MUL, DL: SL, VT, N1: LHS, N2: RHS);
9022 SDValue Top =
9023 DAG.getNode(Opcode: isSigned ? ISD::MULHS : ISD::MULHU, DL: SL, VT, N1: LHS, N2: RHS);
9024
9025 SDValue Sign = isSigned
9026 ? DAG.getNode(Opcode: ISD::SRA, DL: SL, VT, N1: Result,
9027 N2: DAG.getConstant(Val: VT.getScalarSizeInBits() - 1,
9028 DL: SL, VT: MVT::i32))
9029 : DAG.getConstant(Val: 0, DL: SL, VT);
9030 SDValue Overflow = DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: Top, RHS: Sign, Cond: ISD::SETNE);
9031
9032 return DAG.getMergeValues(Ops: {Result, Overflow}, dl: SL);
9033}
9034
9035SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
9036 if (Op->isDivergent()) {
9037 // Select to V_MAD_[IU]64_[IU]32.
9038 return Op;
9039 }
9040 if (Subtarget->hasSMulHi()) {
9041 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
9042 return SDValue();
9043 }
9044 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
9045 // calculate the high part, so we might as well do the whole thing with
9046 // V_MAD_[IU]64_[IU]32.
9047 return Op;
9048}
9049
9050SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
9051 if (!Subtarget->hasTrapHandler() ||
9052 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
9053 return lowerTrapEndpgm(Op, DAG);
9054
9055 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
9056 : lowerTrapHsaQueuePtr(Op, DAG);
9057}
9058
9059SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
9060 SDLoc SL(Op);
9061 SDValue Chain = Op.getOperand(i: 0);
9062 return DAG.getNode(Opcode: AMDGPUISD::ENDPGM_TRAP, DL: SL, VT: MVT::Other, Operand: Chain);
9063}
9064
9065SDValue
9066SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
9067 const SDLoc &DL, Align Alignment,
9068 ImplicitParameter Param) const {
9069 MachineFunction &MF = DAG.getMachineFunction();
9070 uint64_t Offset = getImplicitParameterOffset(MF, Param);
9071 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL: DL, Chain: DAG.getEntryNode(), Offset);
9072 MachinePointerInfo PtrInfo =
9073 getKernargSegmentPtrInfo(MF&: DAG.getMachineFunction());
9074 return DAG.getLoad(
9075 VT, dl: DL, Chain: DAG.getEntryNode(), Ptr, PtrInfo: PtrInfo.getWithOffset(O: Offset), Alignment,
9076 MMOFlags: MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant);
9077}
9078
9079SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
9080 SelectionDAG &DAG) const {
9081 SDLoc SL(Op);
9082 SDValue Chain = Op.getOperand(i: 0);
9083
9084 SDValue QueuePtr;
9085 // For code object version 5, QueuePtr is passed through implicit kernarg.
9086 const Module *M = DAG.getMachineFunction().getFunction().getParent();
9087 if (AMDGPU::getAMDHSACodeObjectVersion(M: *M) >= AMDGPU::AMDHSA_COV5) {
9088 QueuePtr =
9089 loadImplicitKernelArgument(DAG, VT: MVT::i64, DL: SL, Alignment: Align(8), Param: QUEUE_PTR);
9090 } else {
9091 MachineFunction &MF = DAG.getMachineFunction();
9092 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
9093 Register UserSGPR = Info->getQueuePtrUserSGPR();
9094
9095 if (UserSGPR == AMDGPU::NoRegister) {
9096 // We probably are in a function incorrectly marked with
9097 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
9098 // trap, so just use a null pointer.
9099 QueuePtr = DAG.getConstant(Val: 0, DL: SL, VT: MVT::i64);
9100 } else {
9101 QueuePtr = CreateLiveInRegister(DAG, RC: &AMDGPU::SReg_64RegClass, Reg: UserSGPR,
9102 VT: MVT::i64);
9103 }
9104 }
9105
9106 SDValue SGPR01 = DAG.getRegister(Reg: AMDGPU::SGPR0_SGPR1, VT: MVT::i64);
9107 SDValue ToReg = DAG.getCopyToReg(Chain, dl: SL, Reg: SGPR01, N: QueuePtr, Glue: SDValue());
9108
9109 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
9110 SDValue Ops[] = {ToReg, DAG.getTargetConstant(Val: TrapID, DL: SL, VT: MVT::i16), SGPR01,
9111 ToReg.getValue(R: 1)};
9112 return DAG.getNode(Opcode: AMDGPUISD::TRAP, DL: SL, VT: MVT::Other, Ops);
9113}
9114
9115SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
9116 SDLoc SL(Op);
9117 SDValue Chain = Op.getOperand(i: 0);
9118
9119 // We need to simulate the 's_trap 2' instruction on targets that run in
9120 // PRIV=1 (where it is treated as a nop).
9121 if (Subtarget->hasPrivEnabledTrap2NopBug())
9122 return DAG.getNode(Opcode: AMDGPUISD::SIMULATED_TRAP, DL: SL, VT: MVT::Other, Operand: Chain);
9123
9124 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
9125 SDValue Ops[] = {Chain, DAG.getTargetConstant(Val: TrapID, DL: SL, VT: MVT::i16)};
9126 return DAG.getNode(Opcode: AMDGPUISD::TRAP, DL: SL, VT: MVT::Other, Ops);
9127}
9128
9129SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
9130 SDLoc SL(Op);
9131 SDValue Chain = Op.getOperand(i: 0);
9132 MachineFunction &MF = DAG.getMachineFunction();
9133
9134 if (!Subtarget->hasTrapHandler() ||
9135 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
9136 LLVMContext &Ctx = MF.getFunction().getContext();
9137 Ctx.diagnose(DI: DiagnosticInfoUnsupported(MF.getFunction(),
9138 "debugtrap handler not supported",
9139 Op.getDebugLoc(), DS_Warning));
9140 return Chain;
9141 }
9142
9143 uint64_t TrapID =
9144 static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
9145 SDValue Ops[] = {Chain, DAG.getTargetConstant(Val: TrapID, DL: SL, VT: MVT::i16)};
9146 return DAG.getNode(Opcode: AMDGPUISD::TRAP, DL: SL, VT: MVT::Other, Ops);
9147}
9148
9149/// When a divergent value (in VGPR) is passed to an inline asm with an SGPR
9150/// constraint ('s'), we need to insert v_readfirstlane to move the value from
9151/// VGPR to SGPR. This is done by modifying the CopyToReg nodes in the glue
9152/// chain that feed into the INLINEASM node.
9153SDValue SITargetLowering::LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const {
9154 unsigned NumOps = Op.getNumOperands();
9155
9156 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
9157 SmallSet<Register, 8> SGPRInputRegs;
9158
9159 unsigned NumVals = 0;
9160 for (unsigned I = InlineAsm::Op_FirstOperand; I < NumOps - 1;
9161 I += 1 + NumVals) {
9162 const InlineAsm::Flag Flags(Op.getConstantOperandVal(i: I));
9163 NumVals = Flags.getNumOperandRegisters();
9164
9165 unsigned RCID;
9166 bool IsSGPRInput = Flags.getKind() == InlineAsm::Kind::RegUse &&
9167 NumVals > 0 && Flags.hasRegClassConstraint(RC&: RCID) &&
9168 TRI->isSGPRClass(RC: TRI->getRegClass(i: RCID));
9169
9170 for (unsigned J = 0; J < NumVals; ++J) {
9171 SDValue Val = Op.getOperand(i: I + 1 + J);
9172 if (const RegisterSDNode *RegNode =
9173 dyn_cast<RegisterSDNode>(Val: Val.getNode())) {
9174 Register Reg = RegNode->getReg();
9175 if (IsSGPRInput || (Reg.isPhysical() && TRI->isSGPRPhysReg(Reg)))
9176 SGPRInputRegs.insert(V: Reg);
9177 }
9178 }
9179 }
9180
9181 if (SGPRInputRegs.empty())
9182 return Op;
9183
9184 // Walk the glue chain and insert readfirstlane for divergent SGPR inputs.
9185 SDLoc DL(Op);
9186 SDNode *N = Op.getOperand(i: NumOps - 1).getNode();
9187
9188 while (N && N->getOpcode() == ISD::CopyToReg) {
9189 Register Reg = cast<RegisterSDNode>(Val: N->getOperand(Num: 1))->getReg();
9190 SDValue SrcVal = N->getOperand(Num: 2);
9191
9192 // Insert readfirstlane if copying a divergent value to an SGPR input.
9193 if (SrcVal->isDivergent() && SGPRInputRegs.count(V: Reg)) {
9194 SDValue ReadFirstLaneID =
9195 DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL, VT: MVT::i32);
9196 SDValue ReadFirstLane =
9197 DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: SrcVal.getValueType(),
9198 N1: ReadFirstLaneID, N2: SrcVal);
9199
9200 SmallVector<SDValue, 4> Ops = {N->getOperand(Num: 0), N->getOperand(Num: 1),
9201 ReadFirstLane};
9202 if (N->getNumOperands() > 3)
9203 Ops.push_back(Elt: N->getOperand(Num: 3)); // Glue input
9204
9205 DAG.UpdateNodeOperands(N, Ops);
9206 }
9207
9208 // Follow glue chain to next CopyToReg.
9209 SDNode *Next = nullptr;
9210 for (unsigned I = 0, E = N->getNumOperands(); I != E; ++I) {
9211 if (N->getOperand(Num: I).getValueType() == MVT::Glue) {
9212 Next = N->getOperand(Num: I).getNode();
9213 break;
9214 }
9215 }
9216 N = Next;
9217 }
9218
9219 return Op;
9220}
9221
9222SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
9223 SelectionDAG &DAG) const {
9224 if (Subtarget->hasApertureRegs()) {
9225 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
9226 ? AMDGPU::SRC_SHARED_BASE
9227 : AMDGPU::SRC_PRIVATE_BASE;
9228 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
9229 !Subtarget->hasGloballyAddressableScratch()) &&
9230 "Cannot use src_private_base with globally addressable scratch!");
9231 // Note: this feature (register) is broken. When used as a 32-bit operand,
9232 // it returns a wrong value (all zeroes?). The real value is in the upper 32
9233 // bits.
9234 //
9235 // To work around the issue, emit a 64 bit copy from this register
9236 // then extract the high bits. Note that this shouldn't even result in a
9237 // shift being emitted and simply become a pair of registers (e.g.):
9238 // s_mov_b64 s[6:7], src_shared_base
9239 // v_mov_b32_e32 v1, s7
9240 SDValue Copy =
9241 DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: DL, Reg: ApertureRegNo, VT: MVT::v2i32);
9242 return DAG.getExtractVectorElt(DL, VT: MVT::i32, Vec: Copy, Idx: 1);
9243 }
9244
9245 // For code object version 5, private_base and shared_base are passed through
9246 // implicit kernargs.
9247 const Module *M = DAG.getMachineFunction().getFunction().getParent();
9248 if (AMDGPU::getAMDHSACodeObjectVersion(M: *M) >= AMDGPU::AMDHSA_COV5) {
9249 ImplicitParameter Param =
9250 (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
9251 return loadImplicitKernelArgument(DAG, VT: MVT::i32, DL, Alignment: Align(4), Param);
9252 }
9253
9254 MachineFunction &MF = DAG.getMachineFunction();
9255 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
9256 Register UserSGPR = Info->getQueuePtrUserSGPR();
9257 if (UserSGPR == AMDGPU::NoRegister) {
9258 // We probably are in a function incorrectly marked with
9259 // amdgpu-no-queue-ptr. This is undefined.
9260 return DAG.getPOISON(VT: MVT::i32);
9261 }
9262
9263 SDValue QueuePtr =
9264 CreateLiveInRegister(DAG, RC: &AMDGPU::SReg_64RegClass, Reg: UserSGPR, VT: MVT::i64);
9265
9266 // Offset into amd_queue_t for group_segment_aperture_base_hi /
9267 // private_segment_aperture_base_hi.
9268 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
9269
9270 SDValue Ptr =
9271 DAG.getObjectPtrOffset(SL: DL, Ptr: QueuePtr, Offset: TypeSize::getFixed(ExactSize: StructOffset));
9272
9273 // TODO: Use custom target PseudoSourceValue.
9274 // TODO: We should use the value from the IR intrinsic call, but it might not
9275 // be available and how do we get it?
9276 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
9277 return DAG.getLoad(VT: MVT::i32, dl: DL, Chain: QueuePtr.getValue(R: 1), Ptr, PtrInfo,
9278 Alignment: commonAlignment(A: Align(64), Offset: StructOffset),
9279 MMOFlags: MachineMemOperand::MODereferenceable |
9280 MachineMemOperand::MOInvariant);
9281}
9282
9283/// Return true if the value is a known valid address, such that a null check is
9284/// not necessary.
9285static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
9286 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
9287 if (isa<FrameIndexSDNode, GlobalAddressSDNode, BasicBlockSDNode>(Val))
9288 return true;
9289
9290 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
9291 return ConstVal->getSExtValue() != AMDGPU::getNullPointerValue(AS: AddrSpace);
9292
9293 // TODO: Search through arithmetic, handle arguments and loads
9294 // marked nonnull.
9295 return false;
9296}
9297
9298SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
9299 SelectionDAG &DAG) const {
9300 SDLoc SL(Op);
9301
9302 const AMDGPUTargetMachine &TM =
9303 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
9304
9305 unsigned DestAS, SrcAS;
9306 SDValue Src;
9307 bool IsNonNull = false;
9308 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Val&: Op)) {
9309 SrcAS = ASC->getSrcAddressSpace();
9310 Src = ASC->getOperand(Num: 0);
9311 DestAS = ASC->getDestAddressSpace();
9312 } else {
9313 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
9314 Op.getConstantOperandVal(0) ==
9315 Intrinsic::amdgcn_addrspacecast_nonnull);
9316 Src = Op->getOperand(Num: 1);
9317 SrcAS = Op->getConstantOperandVal(Num: 2);
9318 DestAS = Op->getConstantOperandVal(Num: 3);
9319 IsNonNull = true;
9320 }
9321
9322 SDValue FlatNullPtr = DAG.getConstant(Val: 0, DL: SL, VT: MVT::i64);
9323
9324 // flat -> local/private
9325 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
9326 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
9327 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
9328 SDValue Ptr = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: Src);
9329
9330 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
9331 Subtarget->hasGloballyAddressableScratch()) {
9332 // flat -> private with globally addressable scratch: subtract
9333 // src_flat_scratch_base_lo.
9334 SDValue FlatScratchBaseLo(
9335 DAG.getMachineNode(
9336 Opcode: AMDGPU::S_MOV_B32, dl: SL, VT: MVT::i32,
9337 Op1: DAG.getRegister(Reg: AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, VT: MVT::i32)),
9338 0);
9339 Ptr = DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i32, N1: Ptr, N2: FlatScratchBaseLo);
9340 }
9341
9342 if (IsNonNull || isKnownNonNull(Val: Op, DAG, TM, AddrSpace: SrcAS))
9343 return Ptr;
9344
9345 unsigned NullVal = AMDGPU::getNullPointerValue(AS: DestAS);
9346 SDValue SegmentNullPtr = DAG.getConstant(Val: NullVal, DL: SL, VT: MVT::i32);
9347 SDValue NonNull = DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: Src, RHS: FlatNullPtr, Cond: ISD::SETNE);
9348
9349 return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::i32, N1: NonNull, N2: Ptr,
9350 N3: SegmentNullPtr);
9351 }
9352 }
9353
9354 // local/private -> flat
9355 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
9356 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
9357 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
9358 SDValue CvtPtr;
9359 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
9360 Subtarget->hasGloballyAddressableScratch()) {
9361 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
9362 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
9363 SDValue AllOnes = DAG.getSignedTargetConstant(Val: -1, DL: SL, VT: MVT::i32);
9364 SDValue ThreadID = DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32);
9365 ThreadID = DAG.getNode(
9366 Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32,
9367 N1: DAG.getTargetConstant(Val: Intrinsic::amdgcn_mbcnt_lo, DL: SL, VT: MVT::i32),
9368 N2: AllOnes, N3: ThreadID);
9369 if (Subtarget->isWave64())
9370 ThreadID = DAG.getNode(
9371 Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32,
9372 N1: DAG.getTargetConstant(Val: Intrinsic::amdgcn_mbcnt_hi, DL: SL, VT: MVT::i32),
9373 N2: AllOnes, N3: ThreadID);
9374 SDValue ShAmt = DAG.getShiftAmountConstant(
9375 Val: 57 - 32 - Subtarget->getWavefrontSizeLog2(), VT: MVT::i32, DL: SL);
9376 SDValue SrcHi = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: ThreadID, N2: ShAmt);
9377 CvtPtr = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32, N1: Src, N2: SrcHi);
9378 CvtPtr = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: CvtPtr);
9379 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
9380 // 64-bit hi:lo value.
9381 SDValue FlatScratchBase = {
9382 DAG.getMachineNode(
9383 Opcode: AMDGPU::S_MOV_B64, dl: SL, VT: MVT::i64,
9384 Op1: DAG.getRegister(Reg: AMDGPU::SRC_FLAT_SCRATCH_BASE, VT: MVT::i64)),
9385 0};
9386 CvtPtr = DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i64, N1: CvtPtr, N2: FlatScratchBase);
9387 } else {
9388 SDValue Aperture = getSegmentAperture(AS: SrcAS, DL: SL, DAG);
9389 CvtPtr = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32, N1: Src, N2: Aperture);
9390 CvtPtr = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: CvtPtr);
9391 }
9392
9393 if (IsNonNull || isKnownNonNull(Val: Op, DAG, TM, AddrSpace: SrcAS))
9394 return CvtPtr;
9395
9396 unsigned NullVal = AMDGPU::getNullPointerValue(AS: SrcAS);
9397 SDValue SegmentNullPtr = DAG.getConstant(Val: NullVal, DL: SL, VT: MVT::i32);
9398
9399 SDValue NonNull =
9400 DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: Src, RHS: SegmentNullPtr, Cond: ISD::SETNE);
9401
9402 return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::i64, N1: NonNull, N2: CvtPtr,
9403 N3: FlatNullPtr);
9404 }
9405 }
9406
9407 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
9408 Op.getValueType() == MVT::i64) {
9409 const SIMachineFunctionInfo *Info =
9410 DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
9411 if (Info->get32BitAddressHighBits() == 0)
9412 return DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: MVT::i64, Operand: Src);
9413
9414 SDValue Hi = DAG.getConstant(Val: Info->get32BitAddressHighBits(), DL: SL, VT: MVT::i32);
9415 SDValue Vec = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32, N1: Src, N2: Hi);
9416 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: Vec);
9417 }
9418
9419 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
9420 Src.getValueType() == MVT::i64)
9421 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: Src);
9422
9423 // global <-> flat are no-ops and never emitted.
9424
9425 // Invalid casts are poison.
9426 return DAG.getPOISON(VT: Op->getValueType(ResNo: 0));
9427}
9428
9429// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
9430// the small vector and inserting them into the big vector. That is better than
9431// the default expansion of doing it via a stack slot. Even though the use of
9432// the stack slot would be optimized away afterwards, the stack slot itself
9433// remains.
9434SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
9435 SelectionDAG &DAG) const {
9436 SDValue Vec = Op.getOperand(i: 0);
9437 SDValue Ins = Op.getOperand(i: 1);
9438 SDValue Idx = Op.getOperand(i: 2);
9439 EVT VecVT = Vec.getValueType();
9440 EVT InsVT = Ins.getValueType();
9441 EVT EltVT = VecVT.getVectorElementType();
9442 unsigned InsNumElts = InsVT.getVectorNumElements();
9443 unsigned IdxVal = Idx->getAsZExtVal();
9444 SDLoc SL(Op);
9445
9446 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
9447 // Insert 32-bit registers at a time.
9448 assert(InsNumElts % 2 == 0 && "expect legal vector types");
9449
9450 unsigned VecNumElts = VecVT.getVectorNumElements();
9451 EVT NewVecVT =
9452 EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements: VecNumElts / 2);
9453 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
9454 : EVT::getVectorVT(Context&: *DAG.getContext(),
9455 VT: MVT::i32, NumElements: InsNumElts / 2);
9456
9457 Vec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVecVT, Operand: Vec);
9458 Ins = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewInsVT, Operand: Ins);
9459
9460 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
9461 SDValue Elt;
9462 if (InsNumElts == 2) {
9463 Elt = Ins;
9464 } else {
9465 Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Ins,
9466 N2: DAG.getConstant(Val: I, DL: SL, VT: MVT::i32));
9467 }
9468 Vec = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL: SL, VT: NewVecVT, N1: Vec, N2: Elt,
9469 N3: DAG.getConstant(Val: IdxVal / 2 + I, DL: SL, VT: MVT::i32));
9470 }
9471
9472 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: VecVT, Operand: Vec);
9473 }
9474
9475 for (unsigned I = 0; I != InsNumElts; ++I) {
9476 SDValue Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, N1: Ins,
9477 N2: DAG.getConstant(Val: I, DL: SL, VT: MVT::i32));
9478 Vec = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL: SL, VT: VecVT, N1: Vec, N2: Elt,
9479 N3: DAG.getConstant(Val: IdxVal + I, DL: SL, VT: MVT::i32));
9480 }
9481 return Vec;
9482}
9483
9484SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
9485 SelectionDAG &DAG) const {
9486 SDValue Vec = Op.getOperand(i: 0);
9487 SDValue InsVal = Op.getOperand(i: 1);
9488 SDValue Idx = Op.getOperand(i: 2);
9489 EVT VecVT = Vec.getValueType();
9490 EVT EltVT = VecVT.getVectorElementType();
9491 unsigned VecSize = VecVT.getSizeInBits();
9492 unsigned EltSize = EltVT.getSizeInBits();
9493 SDLoc SL(Op);
9494
9495 // Specially handle the case of v4i16 with static indexing.
9496 unsigned NumElts = VecVT.getVectorNumElements();
9497 auto *KIdx = dyn_cast<ConstantSDNode>(Val&: Idx);
9498 if (NumElts == 4 && EltSize == 16 && KIdx) {
9499 SDValue BCVec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Vec);
9500
9501 SDValue LoHalf = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: BCVec,
9502 N2: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32));
9503 SDValue HiHalf = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: BCVec,
9504 N2: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32));
9505
9506 SDValue LoVec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i16, Operand: LoHalf);
9507 SDValue HiVec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i16, Operand: HiHalf);
9508
9509 unsigned Idx = KIdx->getZExtValue();
9510 bool InsertLo = Idx < 2;
9511 SDValue InsHalf = DAG.getNode(
9512 Opcode: ISD::INSERT_VECTOR_ELT, DL: SL, VT: MVT::v2i16, N1: InsertLo ? LoVec : HiVec,
9513 N2: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i16, Operand: InsVal),
9514 N3: DAG.getConstant(Val: InsertLo ? Idx : (Idx - 2), DL: SL, VT: MVT::i32));
9515
9516 InsHalf = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: InsHalf);
9517
9518 SDValue Concat =
9519 InsertLo ? DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {InsHalf, HiHalf})
9520 : DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {LoHalf, InsHalf});
9521
9522 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: VecVT, Operand: Concat);
9523 }
9524
9525 // Static indexing does not lower to stack access, and hence there is no need
9526 // for special custom lowering to avoid stack access.
9527 if (isa<ConstantSDNode>(Val: Idx))
9528 return SDValue();
9529
9530 // Avoid stack access for dynamic indexing by custom lowering to
9531 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
9532
9533 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
9534
9535 MVT IntVT = MVT::getIntegerVT(BitWidth: VecSize);
9536
9537 // Convert vector index to bit-index and get the required bit mask.
9538 assert(isPowerOf2_32(EltSize));
9539 const auto EltMask = maskTrailingOnes<uint64_t>(N: EltSize);
9540 SDValue ScaleFactor = DAG.getConstant(Val: Log2_32(Value: EltSize), DL: SL, VT: MVT::i32);
9541 SDValue ScaledIdx = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: Idx, N2: ScaleFactor);
9542 SDValue BFM = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: IntVT,
9543 N1: DAG.getConstant(Val: EltMask, DL: SL, VT: IntVT), N2: ScaledIdx);
9544
9545 // 1. Create a congruent vector with the target value in each element.
9546 SDValue ExtVal = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: IntVT,
9547 Operand: DAG.getSplatBuildVector(VT: VecVT, DL: SL, Op: InsVal));
9548
9549 // 2. Mask off all other indices except the required index within (1).
9550 SDValue LHS = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: IntVT, N1: BFM, N2: ExtVal);
9551
9552 // 3. Mask off the required index within the target vector.
9553 SDValue BCVec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: IntVT, Operand: Vec);
9554 SDValue RHS =
9555 DAG.getNode(Opcode: ISD::AND, DL: SL, VT: IntVT, N1: DAG.getNOT(DL: SL, Val: BFM, VT: IntVT), N2: BCVec);
9556
9557 // 4. Get (2) and (3) ORed into the target vector.
9558 SDValue BFI =
9559 DAG.getNode(Opcode: ISD::OR, DL: SL, VT: IntVT, N1: LHS, N2: RHS, Flags: SDNodeFlags::Disjoint);
9560
9561 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: VecVT, Operand: BFI);
9562}
9563
9564SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
9565 SelectionDAG &DAG) const {
9566 SDLoc SL(Op);
9567
9568 EVT ResultVT = Op.getValueType();
9569 SDValue Vec = Op.getOperand(i: 0);
9570 SDValue Idx = Op.getOperand(i: 1);
9571 EVT VecVT = Vec.getValueType();
9572 unsigned VecSize = VecVT.getSizeInBits();
9573 EVT EltVT = VecVT.getVectorElementType();
9574
9575 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
9576
9577 // Make sure we do any optimizations that will make it easier to fold
9578 // source modifiers before obscuring it with bit operations.
9579
9580 // XXX - Why doesn't this get called when vector_shuffle is expanded?
9581 if (SDValue Combined = performExtractVectorEltCombine(N: Op.getNode(), DCI))
9582 return Combined;
9583
9584 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
9585 SDValue Lo, Hi;
9586 auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VT: VecVT);
9587
9588 if (VecSize == 128) {
9589 SDValue V2 = DAG.getBitcast(VT: MVT::v2i64, V: Vec);
9590 Lo = DAG.getBitcast(VT: LoVT,
9591 V: DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i64, N1: V2,
9592 N2: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32)));
9593 Hi = DAG.getBitcast(VT: HiVT,
9594 V: DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i64, N1: V2,
9595 N2: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32)));
9596 } else if (VecSize == 256) {
9597 SDValue V2 = DAG.getBitcast(VT: MVT::v4i64, V: Vec);
9598 SDValue Parts[4];
9599 for (unsigned P = 0; P < 4; ++P) {
9600 Parts[P] = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i64, N1: V2,
9601 N2: DAG.getConstant(Val: P, DL: SL, VT: MVT::i32));
9602 }
9603
9604 Lo = DAG.getBitcast(VT: LoVT, V: DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i64,
9605 N1: Parts[0], N2: Parts[1]));
9606 Hi = DAG.getBitcast(VT: HiVT, V: DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i64,
9607 N1: Parts[2], N2: Parts[3]));
9608 } else {
9609 assert(VecSize == 512);
9610
9611 SDValue V2 = DAG.getBitcast(VT: MVT::v8i64, V: Vec);
9612 SDValue Parts[8];
9613 for (unsigned P = 0; P < 8; ++P) {
9614 Parts[P] = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i64, N1: V2,
9615 N2: DAG.getConstant(Val: P, DL: SL, VT: MVT::i32));
9616 }
9617
9618 Lo = DAG.getBitcast(VT: LoVT,
9619 V: DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v4i64,
9620 N1: Parts[0], N2: Parts[1], N3: Parts[2], N4: Parts[3]));
9621 Hi = DAG.getBitcast(VT: HiVT,
9622 V: DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v4i64,
9623 N1: Parts[4], N2: Parts[5], N3: Parts[6], N4: Parts[7]));
9624 }
9625
9626 EVT IdxVT = Idx.getValueType();
9627 unsigned NElem = VecVT.getVectorNumElements();
9628 assert(isPowerOf2_32(NElem));
9629 SDValue IdxMask = DAG.getConstant(Val: NElem / 2 - 1, DL: SL, VT: IdxVT);
9630 SDValue NewIdx = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: IdxVT, N1: Idx, N2: IdxMask);
9631 SDValue Half = DAG.getSelectCC(DL: SL, LHS: Idx, RHS: IdxMask, True: Hi, False: Lo, Cond: ISD::SETUGT);
9632 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, N1: Half, N2: NewIdx);
9633 }
9634
9635 assert(VecSize <= 64);
9636
9637 MVT IntVT = MVT::getIntegerVT(BitWidth: VecSize);
9638
9639 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
9640 SDValue VecBC = peekThroughBitcasts(V: Vec);
9641 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
9642 SDValue Src = VecBC.getOperand(i: 0);
9643 Src = DAG.getBitcast(VT: Src.getValueType().changeTypeToInteger(), V: Src);
9644 Vec = DAG.getAnyExtOrTrunc(Op: Src, DL: SL, VT: IntVT);
9645 }
9646
9647 unsigned EltSize = EltVT.getSizeInBits();
9648 assert(isPowerOf2_32(EltSize));
9649
9650 SDValue ScaleFactor = DAG.getConstant(Val: Log2_32(Value: EltSize), DL: SL, VT: MVT::i32);
9651
9652 // Convert vector index to bit-index (* EltSize)
9653 SDValue ScaledIdx = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: Idx, N2: ScaleFactor);
9654
9655 SDValue BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: IntVT, Operand: Vec);
9656 SDValue Elt = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: IntVT, N1: BC, N2: ScaledIdx);
9657
9658 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
9659 SDValue Result = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i16, Operand: Elt);
9660 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: ResultVT, Operand: Result);
9661 }
9662
9663 return DAG.getAnyExtOrTrunc(Op: Elt, DL: SL, VT: ResultVT);
9664}
9665
9666static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
9667 assert(Elt % 2 == 0);
9668 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
9669}
9670
9671static bool elementPairIsOddToEven(ArrayRef<int> Mask, int Elt) {
9672 assert(Elt % 2 == 0);
9673 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
9674 !(Mask[Elt + 1] & 1);
9675}
9676
9677SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
9678 SelectionDAG &DAG) const {
9679 SDLoc SL(Op);
9680 EVT ResultVT = Op.getValueType();
9681 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Val&: Op);
9682 MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
9683 const int NewSrcNumElts = 2;
9684 MVT PackVT = MVT::getVectorVT(VT: EltVT, NumElements: NewSrcNumElts);
9685 int SrcNumElts = Op.getOperand(i: 0).getValueType().getVectorNumElements();
9686
9687 // Break up the shuffle into registers sized pieces.
9688 //
9689 // We're trying to form sub-shuffles that the register allocation pipeline
9690 // won't be able to figure out, like how to use v_pk_mov_b32 to do a register
9691 // blend or 16-bit op_sel. It should be able to figure out how to reassemble a
9692 // pair of copies into a consecutive register copy, so use the ordinary
9693 // extract_vector_elt lowering unless we can use the shuffle.
9694 //
9695 // TODO: This is a bit of hack, and we should probably always use
9696 // extract_subvector for the largest possible subvector we can (or at least
9697 // use it for PackVT aligned pieces). However we have worse support for
9698 // combines on them don't directly treat extract_subvector / insert_subvector
9699 // as legal. The DAG scheduler also ends up doing a worse job with the
9700 // extract_subvectors.
9701 const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;
9702
9703 // vector_shuffle <0,1,6,7> lhs, rhs
9704 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
9705 //
9706 // vector_shuffle <6,7,2,3> lhs, rhs
9707 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
9708 //
9709 // vector_shuffle <6,7,0,1> lhs, rhs
9710 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
9711
9712 // Avoid scalarizing when both halves are reading from consecutive elements.
9713
9714 // If we're treating 2 element shuffles as legal, also create odd-to-even
9715 // shuffles of neighboring pairs.
9716 //
9717 // vector_shuffle <3,2,7,6> lhs, rhs
9718 // -> concat_vectors vector_shuffle <1, 0> (extract_subvector lhs, 0)
9719 // vector_shuffle <1, 0> (extract_subvector rhs, 2)
9720
9721 SmallVector<SDValue, 16> Pieces;
9722 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
9723 if (ShouldUseConsecutiveExtract &&
9724 elementPairIsContiguous(Mask: SVN->getMask(), Elt: I)) {
9725 const int Idx = SVN->getMaskElt(Idx: I);
9726 int VecIdx = Idx < SrcNumElts ? 0 : 1;
9727 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
9728 SDValue SubVec = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: PackVT,
9729 N1: SVN->getOperand(Num: VecIdx),
9730 N2: DAG.getConstant(Val: EltIdx, DL: SL, VT: MVT::i32));
9731 Pieces.push_back(Elt: SubVec);
9732 } else if (elementPairIsOddToEven(Mask: SVN->getMask(), Elt: I) &&
9733 isOperationLegal(Op: ISD::VECTOR_SHUFFLE, VT: PackVT)) {
9734 int Idx0 = SVN->getMaskElt(Idx: I);
9735 int Idx1 = SVN->getMaskElt(Idx: I + 1);
9736
9737 SDValue SrcOp0 = SVN->getOperand(Num: 0);
9738 SDValue SrcOp1 = SrcOp0;
9739 if (Idx0 >= SrcNumElts) {
9740 SrcOp0 = SVN->getOperand(Num: 1);
9741 Idx0 -= SrcNumElts;
9742 }
9743
9744 if (Idx1 >= SrcNumElts) {
9745 SrcOp1 = SVN->getOperand(Num: 1);
9746 Idx1 -= SrcNumElts;
9747 }
9748
9749 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
9750 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
9751
9752 // Extract nearest even aligned piece.
9753 SDValue SubVec0 = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: PackVT, N1: SrcOp0,
9754 N2: DAG.getConstant(Val: AlignedIdx0, DL: SL, VT: MVT::i32));
9755 SDValue SubVec1 = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: PackVT, N1: SrcOp1,
9756 N2: DAG.getConstant(Val: AlignedIdx1, DL: SL, VT: MVT::i32));
9757
9758 int NewMaskIdx0 = Idx0 - AlignedIdx0;
9759 int NewMaskIdx1 = Idx1 - AlignedIdx1;
9760
9761 SDValue Result0 = SubVec0;
9762 SDValue Result1 = SubVec0;
9763
9764 if (SubVec0 != SubVec1) {
9765 NewMaskIdx1 += NewSrcNumElts;
9766 Result1 = SubVec1;
9767 } else {
9768 Result1 = DAG.getPOISON(VT: PackVT);
9769 }
9770
9771 SDValue Shuf = DAG.getVectorShuffle(VT: PackVT, dl: SL, N1: Result0, N2: Result1,
9772 Mask: {NewMaskIdx0, NewMaskIdx1});
9773 Pieces.push_back(Elt: Shuf);
9774 } else {
9775 const int Idx0 = SVN->getMaskElt(Idx: I);
9776 const int Idx1 = SVN->getMaskElt(Idx: I + 1);
9777 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
9778 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
9779 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
9780 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
9781
9782 SDValue Vec0 = SVN->getOperand(Num: VecIdx0);
9783 SDValue Elt0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, N1: Vec0,
9784 N2: DAG.getSignedConstant(Val: EltIdx0, DL: SL, VT: MVT::i32));
9785
9786 SDValue Vec1 = SVN->getOperand(Num: VecIdx1);
9787 SDValue Elt1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, N1: Vec1,
9788 N2: DAG.getSignedConstant(Val: EltIdx1, DL: SL, VT: MVT::i32));
9789 Pieces.push_back(Elt: DAG.getBuildVector(VT: PackVT, DL: SL, Ops: {Elt0, Elt1}));
9790 }
9791 }
9792
9793 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SL, VT: ResultVT, Ops: Pieces);
9794}
9795
9796SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
9797 SelectionDAG &DAG) const {
9798 SDValue SVal = Op.getOperand(i: 0);
9799 EVT ResultVT = Op.getValueType();
9800 EVT SValVT = SVal.getValueType();
9801 SDValue UndefVal = DAG.getPOISON(VT: SValVT);
9802 SDLoc SL(Op);
9803
9804 SmallVector<SDValue, 8> VElts;
9805 VElts.push_back(Elt: SVal);
9806 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
9807 VElts.push_back(Elt: UndefVal);
9808
9809 return DAG.getBuildVector(VT: ResultVT, DL: SL, Ops: VElts);
9810}
9811
9812SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
9813 SelectionDAG &DAG) const {
9814 SDLoc SL(Op);
9815 EVT VT = Op.getValueType();
9816
9817 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
9818 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
9819
9820 SDValue Lo = Op.getOperand(i: 0);
9821 SDValue Hi = Op.getOperand(i: 1);
9822
9823 // Avoid adding defined bits with the zero_extend.
9824 if (Hi.isUndef()) {
9825 Lo = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i16, Operand: Lo);
9826 SDValue ExtLo = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i32, Operand: Lo);
9827 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: ExtLo);
9828 }
9829
9830 Hi = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i16, Operand: Hi);
9831 Hi = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: MVT::i32, Operand: Hi);
9832
9833 SDValue ShlHi = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: Hi,
9834 N2: DAG.getConstant(Val: 16, DL: SL, VT: MVT::i32));
9835 if (Lo.isUndef())
9836 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: ShlHi);
9837
9838 Lo = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i16, Operand: Lo);
9839 Lo = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: MVT::i32, Operand: Lo);
9840
9841 SDValue Or =
9842 DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: Lo, N2: ShlHi, Flags: SDNodeFlags::Disjoint);
9843 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Or);
9844 }
9845
9846 // Split into 2-element chunks.
9847 const unsigned NumParts = VT.getVectorNumElements() / 2;
9848 EVT PartVT = MVT::getVectorVT(VT: VT.getVectorElementType().getSimpleVT(), NumElements: 2);
9849 MVT PartIntVT = MVT::getIntegerVT(BitWidth: PartVT.getSizeInBits());
9850
9851 SmallVector<SDValue> Casts;
9852 for (unsigned P = 0; P < NumParts; ++P) {
9853 SDValue Vec = DAG.getBuildVector(
9854 VT: PartVT, DL: SL, Ops: {Op.getOperand(i: P * 2), Op.getOperand(i: P * 2 + 1)});
9855 Casts.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: PartIntVT, Operand: Vec));
9856 }
9857
9858 SDValue Blend =
9859 DAG.getBuildVector(VT: MVT::getVectorVT(VT: PartIntVT, NumElements: NumParts), DL: SL, Ops: Casts);
9860 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Blend);
9861}
9862
9863bool SITargetLowering::isOffsetFoldingLegal(
9864 const GlobalAddressSDNode *GA) const {
9865 // OSes that use ELF REL relocations (instead of RELA) can only store a
9866 // 32-bit addend in the instruction, so it is not safe to allow offset folding
9867 // which can create arbitrary 64-bit addends. (This is only a problem for
9868 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
9869 // the high 32 bits of the addend.)
9870 //
9871 // This should be kept in sync with how HasRelocationAddend is initialized in
9872 // the constructor of ELFAMDGPUAsmBackend.
9873 if (!Subtarget->isAmdHsaOS())
9874 return false;
9875
9876 // We can fold offsets for anything that doesn't require a GOT relocation.
9877 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
9878 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
9879 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
9880 !shouldEmitGOTReloc(GV: GA->getGlobal());
9881}
9882
9883static SDValue
9884buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
9885 const SDLoc &DL, int64_t Offset, EVT PtrVT,
9886 unsigned GAFlags = SIInstrInfo::MO_NONE) {
9887 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
9888 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
9889 // lowered to the following code sequence:
9890 //
9891 // For constant address space:
9892 // s_getpc_b64 s[0:1]
9893 // s_add_u32 s0, s0, $symbol
9894 // s_addc_u32 s1, s1, 0
9895 //
9896 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
9897 // a fixup or relocation is emitted to replace $symbol with a literal
9898 // constant, which is a pc-relative offset from the encoding of the $symbol
9899 // operand to the global variable.
9900 //
9901 // For global address space:
9902 // s_getpc_b64 s[0:1]
9903 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
9904 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
9905 //
9906 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
9907 // fixups or relocations are emitted to replace $symbol@*@lo and
9908 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
9909 // which is a 64-bit pc-relative offset from the encoding of the $symbol
9910 // operand to the global variable.
9911 if (((const GCNSubtarget &)DAG.getSubtarget()).has64BitLiterals()) {
9912 assert(GAFlags != SIInstrInfo::MO_NONE);
9913
9914 SDValue Ptr =
9915 DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i64, offset: Offset, TargetFlags: GAFlags + 2);
9916 return DAG.getNode(Opcode: AMDGPUISD::PC_ADD_REL_OFFSET64, DL, VT: PtrVT, Operand: Ptr);
9917 }
9918
9919 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i32, offset: Offset, TargetFlags: GAFlags);
9920 SDValue PtrHi;
9921 if (GAFlags == SIInstrInfo::MO_NONE)
9922 PtrHi = DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32);
9923 else
9924 PtrHi = DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i32, offset: Offset, TargetFlags: GAFlags + 1);
9925 return DAG.getNode(Opcode: AMDGPUISD::PC_ADD_REL_OFFSET, DL, VT: PtrVT, N1: PtrLo, N2: PtrHi);
9926}
9927
9928SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunctionInfo *MFI,
9929 SDValue Op,
9930 SelectionDAG &DAG) const {
9931 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Val&: Op);
9932 SDLoc DL(GSD);
9933 EVT PtrVT = Op.getValueType();
9934
9935 const GlobalValue *GV = GSD->getGlobal();
9936 if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
9937 shouldUseLDSConstAddress(GV)) ||
9938 GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
9939 GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
9940 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
9941 GV->hasExternalLinkage()) {
9942 const GlobalVariable &GVar = *cast<GlobalVariable>(Val: GV);
9943 // HIP uses an unsized array `extern __shared__ T s[]` or similar
9944 // zero-sized type in other languages to declare the dynamic shared
9945 // memory which size is not known at the compile time. They will be
9946 // allocated by the runtime and placed directly after the static
9947 // allocated ones. They all share the same offset.
9948 if (GVar.getGlobalSize(DL: GVar.getDataLayout()) == 0) {
9949 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
9950 // Adjust alignment for that dynamic shared memory array.
9951 Function &F = DAG.getMachineFunction().getFunction();
9952 MFI->setDynLDSAlign(F, GV: GVar);
9953 MFI->setUsesDynamicLDS(true);
9954 return SDValue(
9955 DAG.getMachineNode(Opcode: AMDGPU::GET_GROUPSTATICSIZE, dl: DL, VT: PtrVT), 0);
9956 }
9957 }
9958 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
9959 }
9960
9961 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
9962 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i32, offset: GSD->getOffset(),
9963 TargetFlags: SIInstrInfo::MO_ABS32_LO);
9964 return DAG.getNode(Opcode: AMDGPUISD::LDS, DL, VT: MVT::i32, Operand: GA);
9965 }
9966
9967 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
9968 if (Subtarget->has64BitLiterals()) {
9969 SDValue Addr = DAG.getTargetGlobalAddress(
9970 GV, DL, VT: MVT::i64, offset: GSD->getOffset(), TargetFlags: SIInstrInfo::MO_ABS64);
9971 return SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B64, dl: DL, VT: MVT::i64, Op1: Addr),
9972 0);
9973 }
9974
9975 SDValue AddrLo = DAG.getTargetGlobalAddress(
9976 GV, DL, VT: MVT::i32, offset: GSD->getOffset(), TargetFlags: SIInstrInfo::MO_ABS32_LO);
9977 AddrLo = {DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32, Op1: AddrLo), 0};
9978
9979 SDValue AddrHi = DAG.getTargetGlobalAddress(
9980 GV, DL, VT: MVT::i32, offset: GSD->getOffset(), TargetFlags: SIInstrInfo::MO_ABS32_HI);
9981 AddrHi = {DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32, Op1: AddrHi), 0};
9982
9983 return DAG.getNode(Opcode: ISD::BUILD_PAIR, DL, VT: MVT::i64, N1: AddrLo, N2: AddrHi);
9984 }
9985
9986 if (shouldEmitFixup(GV))
9987 return buildPCRelGlobalAddress(DAG, GV, DL, Offset: GSD->getOffset(), PtrVT);
9988
9989 if (shouldEmitPCReloc(GV))
9990 return buildPCRelGlobalAddress(DAG, GV, DL, Offset: GSD->getOffset(), PtrVT,
9991 GAFlags: SIInstrInfo::MO_REL32);
9992
9993 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, Offset: 0, PtrVT,
9994 GAFlags: SIInstrInfo::MO_GOTPCREL32);
9995 PointerType *PtrTy =
9996 PointerType::get(C&: *DAG.getContext(), AddressSpace: AMDGPUAS::CONSTANT_ADDRESS);
9997 const DataLayout &DataLayout = DAG.getDataLayout();
9998 Align Alignment = DataLayout.getABITypeAlign(Ty: PtrTy);
9999 MachinePointerInfo PtrInfo =
10000 MachinePointerInfo::getGOT(MF&: DAG.getMachineFunction());
10001
10002 return DAG.getLoad(VT: PtrVT, dl: DL, Chain: DAG.getEntryNode(), Ptr: GOTAddr, PtrInfo, Alignment,
10003 MMOFlags: MachineMemOperand::MODereferenceable |
10004 MachineMemOperand::MOInvariant);
10005}
10006
10007SDValue SITargetLowering::LowerExternalSymbol(SDValue Op,
10008 SelectionDAG &DAG) const {
10009 // TODO: Handle this. It should be mostly the same as LowerGlobalAddress.
10010 const Function &Fn = DAG.getMachineFunction().getFunction();
10011 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
10012 Fn, "unsupported external symbol", Op.getDebugLoc()));
10013 return DAG.getPOISON(VT: Op.getValueType());
10014}
10015
10016SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
10017 const SDLoc &DL, SDValue V) const {
10018 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
10019 // the destination register.
10020 //
10021 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
10022 // so we will end up with redundant moves to m0.
10023 //
10024 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
10025
10026 // A Null SDValue creates a glue result.
10027 SDNode *M0 = DAG.getMachineNode(Opcode: AMDGPU::SI_INIT_M0, dl: DL, VT1: MVT::Other, VT2: MVT::Glue,
10028 Op1: V, Op2: Chain);
10029 return SDValue(M0, 0);
10030}
10031
10032SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
10033 MVT VT,
10034 unsigned Offset) const {
10035 SDLoc SL(Op);
10036 SDValue Param = lowerKernargMemParameter(
10037 DAG, VT: MVT::i32, MemVT: MVT::i32, SL, Chain: DAG.getEntryNode(), Offset, Alignment: Align(4), Signed: false);
10038 // The local size values will have the hi 16-bits as zero.
10039 return DAG.getNode(Opcode: ISD::AssertZext, DL: SL, VT: MVT::i32, N1: Param,
10040 N2: DAG.getValueType(VT));
10041}
10042
10043static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
10044 EVT VT) {
10045 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
10046 DAG.getMachineFunction().getFunction(),
10047 "non-hsa intrinsic with hsa target", DL.getDebugLoc()));
10048 return DAG.getPOISON(VT);
10049}
10050
10051static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
10052 EVT VT) {
10053 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
10054 DAG.getMachineFunction().getFunction(),
10055 "intrinsic not supported on subtarget", DL.getDebugLoc()));
10056 return DAG.getPOISON(VT);
10057}
10058
10059static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
10060 ArrayRef<SDValue> Elts) {
10061 assert(!Elts.empty());
10062 MVT Type;
10063 unsigned NumElts = Elts.size();
10064
10065 if (NumElts <= 12) {
10066 Type = MVT::getVectorVT(VT: MVT::f32, NumElements: NumElts);
10067 } else {
10068 assert(Elts.size() <= 16);
10069 Type = MVT::v16f32;
10070 NumElts = 16;
10071 }
10072
10073 SmallVector<SDValue, 16> VecElts(NumElts);
10074 for (unsigned i = 0; i < Elts.size(); ++i) {
10075 SDValue Elt = Elts[i];
10076 if (Elt.getValueType() != MVT::f32)
10077 Elt = DAG.getBitcast(VT: MVT::f32, V: Elt);
10078 VecElts[i] = Elt;
10079 }
10080 for (unsigned i = Elts.size(); i < NumElts; ++i)
10081 VecElts[i] = DAG.getPOISON(VT: MVT::f32);
10082
10083 if (NumElts == 1)
10084 return VecElts[0];
10085 return DAG.getBuildVector(VT: Type, DL, Ops: VecElts);
10086}
10087
10088static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
10089 SDValue Src, int ExtraElts) {
10090 EVT SrcVT = Src.getValueType();
10091
10092 SmallVector<SDValue, 8> Elts;
10093
10094 if (SrcVT.isVector())
10095 DAG.ExtractVectorElements(Op: Src, Args&: Elts);
10096 else
10097 Elts.push_back(Elt: Src);
10098
10099 SDValue Undef = DAG.getPOISON(VT: SrcVT.getScalarType());
10100 while (ExtraElts--)
10101 Elts.push_back(Elt: Undef);
10102
10103 return DAG.getBuildVector(VT: CastVT, DL, Ops: Elts);
10104}
10105
10106// Re-construct the required return value for a image load intrinsic.
10107// This is more complicated due to the optional use TexFailCtrl which means the
10108// required return type is an aggregate
10109static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
10110 ArrayRef<EVT> ResultTypes, bool IsTexFail,
10111 bool Unpacked, bool IsD16, int DMaskPop,
10112 int NumVDataDwords, bool IsAtomicPacked16Bit,
10113 const SDLoc &DL) {
10114 // Determine the required return type. This is the same regardless of
10115 // IsTexFail flag
10116 EVT ReqRetVT = ResultTypes[0];
10117 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
10118 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
10119 ? (ReqRetNumElts + 1) / 2
10120 : ReqRetNumElts;
10121
10122 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
10123
10124 MVT DataDwordVT =
10125 NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(VT: MVT::i32, NumElements: NumDataDwords);
10126
10127 MVT MaskPopVT =
10128 MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(VT: MVT::i32, NumElements: MaskPopDwords);
10129
10130 SDValue Data(Result, 0);
10131 SDValue TexFail;
10132
10133 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
10134 SDValue ZeroIdx = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
10135 if (MaskPopVT.isVector()) {
10136 Data = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MaskPopVT,
10137 N1: SDValue(Result, 0), N2: ZeroIdx);
10138 } else {
10139 Data = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MaskPopVT,
10140 N1: SDValue(Result, 0), N2: ZeroIdx);
10141 }
10142 }
10143
10144 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
10145 Data = padEltsToUndef(DAG, DL, CastVT: DataDwordVT, Src: Data,
10146 ExtraElts: NumDataDwords - MaskPopDwords);
10147
10148 if (IsD16)
10149 Data = adjustLoadValueTypeImpl(Result: Data, LoadVT: ReqRetVT, DL, DAG, Unpacked);
10150
10151 EVT LegalReqRetVT = ReqRetVT;
10152 if (!ReqRetVT.isVector()) {
10153 if (!Data.getValueType().isInteger())
10154 Data = DAG.getNode(Opcode: ISD::BITCAST, DL,
10155 VT: Data.getValueType().changeTypeToInteger(), Operand: Data);
10156 Data = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: ReqRetVT.changeTypeToInteger(), Operand: Data);
10157 } else {
10158 // We need to widen the return vector to a legal type
10159 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
10160 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
10161 LegalReqRetVT =
10162 EVT::getVectorVT(Context&: *DAG.getContext(), VT: ReqRetVT.getVectorElementType(),
10163 NumElements: ReqRetVT.getVectorNumElements() + 1);
10164 }
10165 }
10166 Data = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: LegalReqRetVT, Operand: Data);
10167
10168 if (IsTexFail) {
10169 TexFail =
10170 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: SDValue(Result, 0),
10171 N2: DAG.getConstant(Val: MaskPopDwords, DL, VT: MVT::i32));
10172
10173 return DAG.getMergeValues(Ops: {Data, TexFail, SDValue(Result, 1)}, dl: DL);
10174 }
10175
10176 if (Result->getNumValues() == 1)
10177 return Data;
10178
10179 return DAG.getMergeValues(Ops: {Data, SDValue(Result, 1)}, dl: DL);
10180}
10181
10182static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
10183 SDValue *LWE, bool &IsTexFail) {
10184 auto *TexFailCtrlConst = cast<ConstantSDNode>(Val: TexFailCtrl.getNode());
10185
10186 uint64_t Value = TexFailCtrlConst->getZExtValue();
10187 if (Value) {
10188 IsTexFail = true;
10189 }
10190
10191 SDLoc DL(TexFailCtrlConst);
10192 *TFE = DAG.getTargetConstant(Val: (Value & 0x1) ? 1 : 0, DL, VT: MVT::i32);
10193 Value &= ~(uint64_t)0x1;
10194 *LWE = DAG.getTargetConstant(Val: (Value & 0x2) ? 1 : 0, DL, VT: MVT::i32);
10195 Value &= ~(uint64_t)0x2;
10196
10197 return Value == 0;
10198}
10199
10200static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
10201 MVT PackVectorVT,
10202 SmallVectorImpl<SDValue> &PackedAddrs,
10203 unsigned DimIdx, unsigned EndIdx,
10204 unsigned NumGradients) {
10205 SDLoc DL(Op);
10206 for (unsigned I = DimIdx; I < EndIdx; I++) {
10207 SDValue Addr = Op.getOperand(i: I);
10208
10209 // Gradients are packed with undef for each coordinate.
10210 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
10211 // 1D: undef,dx/dh; undef,dx/dv
10212 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
10213 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
10214 if (((I + 1) >= EndIdx) ||
10215 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
10216 I == DimIdx + NumGradients - 1))) {
10217 if (Addr.getValueType() != MVT::i16)
10218 Addr = DAG.getBitcast(VT: MVT::i16, V: Addr);
10219 Addr = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::i32, Operand: Addr);
10220 } else {
10221 Addr = DAG.getBuildVector(VT: PackVectorVT, DL, Ops: {Addr, Op.getOperand(i: I + 1)});
10222 I++;
10223 }
10224 Addr = DAG.getBitcast(VT: MVT::f32, V: Addr);
10225 PackedAddrs.push_back(Elt: Addr);
10226 }
10227}
10228
/// Lower the image dimension intrinsic \p Op, described by \p Intr, to a
/// machine node whose opcode is looked up with AMDGPU::getMIMGOpcode.
///
/// \p WithChain selects where the intrinsic's own arguments start in the
/// operand list (operand 2 when true, operand 1 when false).
///
/// \p Op is returned unchanged whenever the operation cannot be handled here:
/// D16 data on a subtarget or instruction without D16 support, a return type
/// too small for the dmask, a 16-bit address/gradient combination the
/// subtarget does not support, unknown TexFailCtrl or cache-policy bits, an
/// unexpected resource or sampler type, or no matching opcode.
10229SDValue SITargetLowering::lowerImage(SDValue Op,
10230 const AMDGPU::ImageDimIntrinsicInfo *Intr,
10231 SelectionDAG &DAG, bool WithChain) const {
10232 SDLoc DL(Op);
10233 MachineFunction &MF = DAG.getMachineFunction();
10234 const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
10235 unsigned IntrOpcode = Intr->BaseOpcode;
10236 // For image atomic: use no-return opcode if result is unused.
10237 if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode &&
10238 !Op.getNode()->hasAnyUseOfValue(Value: 0))
10239 IntrOpcode = Intr->AtomicNoRetBaseOpcode;
10240 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
10241 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: IntrOpcode);
10242 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(DimEnum: Intr->Dim);
10243 bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI: *Subtarget);
10244 bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI: *Subtarget);
10245 bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI: *Subtarget);
10246 bool IsGFX13 = AMDGPU::isGFX13(STI: *Subtarget);
10247
  // ResultTypes is adjusted below to describe the results of the new machine
  // node; OrigResultTypes keeps the intrinsic's result types so the original
  // return value can be rebuilt at the end.
10248 SmallVector<EVT, 3> ResultTypes(Op->values());
10249 SmallVector<EVT, 3> OrigResultTypes(Op->values());
10250 if (BaseOpcode->NoReturn && BaseOpcode->Atomic)
10251 ResultTypes.erase(CI: &ResultTypes[0]);
10252
10253 bool IsD16 = false;
10254 bool IsG16 = false;
10255 bool IsA16 = false;
10256 SDValue VData;
10257 int NumVDataDwords = 0;
10258 bool AdjustRetType = false;
10259 bool IsAtomicPacked16Bit = false;
10260
10261 // Offset of intrinsic arguments
10262 const unsigned ArgOffset = WithChain ? 2 : 1;
10263
10264 unsigned DMask;
10265 unsigned DMaskLanes = 0;
10266
  // Atomics take their data from operand 2 (and operand 3 for the X2 forms);
  // DMask and NumVDataDwords then follow from the data width alone. All other
  // opcodes read DMask from the intrinsic's dmask operand.
10267 if (BaseOpcode->Atomic) {
10268 VData = Op.getOperand(i: 2);
10269
10270 IsAtomicPacked16Bit =
10271 (IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
10272 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN ||
10273 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 ||
10274 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN);
10275
10276 bool Is64Bit = VData.getValueSizeInBits() == 64;
10277 if (BaseOpcode->AtomicX2) {
10278 SDValue VData2 = Op.getOperand(i: 3);
10279 VData = DAG.getBuildVector(VT: Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
10280 Ops: {VData, VData2});
10281 if (Is64Bit)
10282 VData = DAG.getBitcast(VT: MVT::v4i32, V: VData);
10283
10284 if (!BaseOpcode->NoReturn)
10285 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
10286
10287 DMask = Is64Bit ? 0xf : 0x3;
10288 NumVDataDwords = Is64Bit ? 4 : 2;
10289 } else {
10290 DMask = Is64Bit ? 0x3 : 0x1;
10291 NumVDataDwords = Is64Bit ? 2 : 1;
10292 }
10293 } else {
10294 DMask = Op->getConstantOperandVal(Num: ArgOffset + Intr->DMaskIndex);
10295 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(Value: DMask);
10296
10297 if (BaseOpcode->Store) {
10298 VData = Op.getOperand(i: 2);
10299
10300 MVT StoreVT = VData.getSimpleValueType();
10301 if (StoreVT.getScalarType() == MVT::f16) {
10302 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
10303 return Op; // D16 is unsupported for this instruction
10304
10305 IsD16 = true;
10306 VData = handleD16VData(VData, DAG, ImageStore: true);
10307 }
10308
10309 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
10310 } else if (!BaseOpcode->NoReturn) {
10311 // Work out the num dwords based on the dmask popcount and underlying type
10312 // and whether packing is supported.
10313 MVT LoadVT = ResultTypes[0].getSimpleVT();
10314 if (LoadVT.getScalarType() == MVT::f16) {
10315 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
10316 return Op; // D16 is unsupported for this instruction
10317
10318 IsD16 = true;
10319 }
10320
10321 // Confirm that the return type is large enough for the dmask specified
10322 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
10323 (!LoadVT.isVector() && DMaskLanes > 1))
10324 return Op;
10325
10326 // The sq block of gfx8 and gfx9 do not estimate register use correctly
10327 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
10328 // instructions.
10329 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
10330 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
10331 NumVDataDwords = (DMaskLanes + 1) / 2;
10332 else
10333 NumVDataDwords = DMaskLanes;
10334
10335 AdjustRetType = true;
10336 }
10337 }
10338
10339 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
10340 SmallVector<SDValue, 4> VAddrs;
10341
10342 // Check for 16 bit addresses or derivatives and pack if true.
10343 MVT VAddrVT =
10344 Op.getOperand(i: ArgOffset + Intr->GradientStart).getSimpleValueType();
10345 MVT VAddrScalarVT = VAddrVT.getScalarType();
10346 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
10347 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
10348
10349 VAddrVT = Op.getOperand(i: ArgOffset + Intr->CoordStart).getSimpleValueType();
10350 VAddrScalarVT = VAddrVT.getScalarType();
10351 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
10352 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
10353
10354 // Push back extra arguments.
10355 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
10356 if (IsA16 && (Op.getOperand(i: ArgOffset + I).getValueType() == MVT::f16)) {
10357 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
10358 // Special handling of bias when A16 is on. Bias is of type half but
10359 // occupies full 32-bit.
10360 SDValue Bias = DAG.getBuildVector(
10361 VT: MVT::v2f16, DL,
10362 Ops: {Op.getOperand(i: ArgOffset + I), DAG.getPOISON(VT: MVT::f16)});
10363 VAddrs.push_back(Elt: Bias);
10364 } else {
10365 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
10366 "Bias needs to be converted to 16 bit in A16 mode");
10367 VAddrs.push_back(Elt: Op.getOperand(i: ArgOffset + I));
10368 }
10369 }
10370
10371 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
10372 // 16 bit gradients are supported, but are tied to the A16 control
10373 // so both gradients and addresses must be 16 bit
10374 LLVM_DEBUG(
10375 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
10376 "require 16 bit args for both gradients and addresses");
10377 return Op;
10378 }
10379
10380 if (IsA16) {
10381 if (!ST->hasA16()) {
10382 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
10383 "support 16 bit addresses\n");
10384 return Op;
10385 }
10386 }
10387
10388 // We've dealt with incorrect input so we know that if IsA16, IsG16
10389 // are set then we have to compress/pack operands (either address,
10390 // gradient or both)
10391 // In the case where a16 and gradients are tied (no G16 support) then we
10392 // have already verified that both IsA16 and IsG16 are true
10393 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
10394 // Activate g16
10395 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
10396 AMDGPU::getMIMGG16MappingInfo(G: Intr->BaseOpcode);
10397 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
10398 }
10399
10400 // Add gradients (packed or unpacked)
10401 if (IsG16) {
10402 // Pack the gradients
10403 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
10404 packImage16bitOpsToDwords(DAG, Op, PackVectorVT: GradPackVectorVT, PackedAddrs&: VAddrs,
10405 DimIdx: ArgOffset + Intr->GradientStart,
10406 EndIdx: ArgOffset + Intr->CoordStart, NumGradients: Intr->NumGradients);
10407 } else {
10408 for (unsigned I = ArgOffset + Intr->GradientStart;
10409 I < ArgOffset + Intr->CoordStart; I++)
10410 VAddrs.push_back(Elt: Op.getOperand(i: I));
10411 }
10412
10413 // Add addresses (packed or unpacked)
10414 if (IsA16) {
10415 packImage16bitOpsToDwords(DAG, Op, PackVectorVT: AddrPackVectorVT, PackedAddrs&: VAddrs,
10416 DimIdx: ArgOffset + Intr->CoordStart, EndIdx: VAddrEnd,
10417 NumGradients: 0 /* No gradients */);
10418 } else {
10419 // Add uncompressed address
10420 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
10421 VAddrs.push_back(Elt: Op.getOperand(i: I));
10422 }
10423
10424 // If the register allocator cannot place the address registers contiguously
10425 // without introducing moves, then using the non-sequential address encoding
10426 // is always preferable, since it saves VALU instructions and is usually a
10427 // wash in terms of code size or even better.
10428 //
10429 // However, we currently have no way of hinting to the register allocator that
10430 // MIMG addresses should be placed contiguously when it is possible to do so,
10431 // so force non-NSA for the common 2-address case as a heuristic.
10432 //
10433 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
10434 // allocation when possible.
10435 //
10436 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
10437 // set of the remaining addresses.
10438 const unsigned NSAMaxSize = ST->getNSAMaxSize(HasSampler: BaseOpcode->Sampler);
10439 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
10440 const bool UseNSA = ST->hasNSAEncoding() &&
10441 VAddrs.size() >= ST->getNSAThreshold(MF) &&
10442 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
10443 const bool UsePartialNSA =
10444 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
10445
  // VAddr holds the combined address operand: every address when NSA is not
  // used, or only the addresses from index NSAMaxSize - 1 onwards for partial
  // NSA. It stays null for full NSA, where each address is its own operand.
10446 SDValue VAddr;
10447 if (UsePartialNSA) {
10448 VAddr = getBuildDwordsVector(DAG, DL,
10449 Elts: ArrayRef(VAddrs).drop_front(N: NSAMaxSize - 1));
10450 } else if (!UseNSA) {
10451 VAddr = getBuildDwordsVector(DAG, DL, Elts: VAddrs);
10452 }
10453
10454 SDValue True = DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1);
10455 SDValue False = DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1);
10456 SDValue Unorm;
10457 if (!BaseOpcode->Sampler) {
10458 Unorm = True;
10459 } else {
10460 uint64_t UnormConst =
10461 Op.getConstantOperandVal(i: ArgOffset + Intr->UnormIndex);
10462
10463 Unorm = UnormConst ? True : False;
10464 }
10465
10466 SDValue TFE;
10467 SDValue LWE;
10468 SDValue TexFail = Op.getOperand(i: ArgOffset + Intr->TexFailCtrlIndex);
10469 bool IsTexFail = false;
10470 if (!parseTexFail(TexFailCtrl: TexFail, DAG, TFE: &TFE, LWE: &LWE, IsTexFail))
10471 return Op;
10472
10473 if (IsTexFail) {
10474 if (!DMaskLanes) {
10475 // Expecting to get an error flag since TFC is on - and dmask is 0
10476 // Force dmask to be at least 1 otherwise the instruction will fail
10477 DMask = 0x1;
10478 DMaskLanes = 1;
10479 NumVDataDwords = 1;
10480 }
10481 NumVDataDwords += 1;
10482 AdjustRetType = true;
10483 }
10484
10485 // Has something earlier tagged that the return type needs adjusting
10486 // This happens if the instruction is a load or has set TexFailCtrl flags
10487 if (AdjustRetType) {
10488 // NumVDataDwords reflects the true number of dwords required in the return
10489 // type
10490 if (DMaskLanes == 0 && !BaseOpcode->Store) {
10491 // This is a no-op load. This can be eliminated
10492 SDValue Undef = DAG.getPOISON(VT: Op.getValueType());
10493 if (isa<MemSDNode>(Val: Op))
10494 return DAG.getMergeValues(Ops: {Undef, Op.getOperand(i: 0)}, dl: DL);
10495 return Undef;
10496 }
10497
10498 EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(Context&: *DAG.getContext(),
10499 VT: MVT::i32, NumElements: NumVDataDwords)
10500 : MVT::i32;
10501
10502 ResultTypes[0] = NewVT;
10503 if (ResultTypes.size() == 3) {
10504 // Original result was aggregate type used for TexFailCtrl results
10505 // The actual instruction returns as a vector type which has now been
10506 // created. Remove the aggregate result.
10507 ResultTypes.erase(CI: &ResultTypes[1]);
10508 }
10509 }
10510
10511 unsigned CPol = Op.getConstantOperandVal(i: ArgOffset + Intr->CachePolicyIndex);
10512 // Keep GLC only when the atomic's result is actually used.
10513 if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
10514 CPol |= AMDGPU::CPol::GLC;
10515 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
10516 AMDGPU::CPol::VOLATILE))
10517 return Op;
10518
  // Assemble the machine node's operands. The order of the pushes below is
  // significant, and which operands are present depends on the subtarget
  // generation and on the base opcode.
10519 SmallVector<SDValue, 26> Ops;
10520 if (BaseOpcode->Store || BaseOpcode->Atomic)
10521 Ops.push_back(Elt: VData); // vdata
10522 if (UsePartialNSA) {
10523 append_range(C&: Ops, R: ArrayRef(VAddrs).take_front(N: NSAMaxSize - 1));
10524 Ops.push_back(Elt: VAddr);
10525 } else if (UseNSA)
10526 append_range(C&: Ops, R&: VAddrs);
10527 else
10528 Ops.push_back(Elt: VAddr);
10529 SDValue Rsrc = Op.getOperand(i: ArgOffset + Intr->RsrcIndex);
10530 EVT RsrcVT = Rsrc.getValueType();
10531 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
10532 return Op;
10533 Ops.push_back(Elt: Rsrc);
10534 if (BaseOpcode->Sampler) {
10535 SDValue Samp = Op.getOperand(i: ArgOffset + Intr->SampIndex);
10536 if (Samp.getValueType() != MVT::v4i32)
10537 return Op;
10538 Ops.push_back(Elt: Samp);
10539 }
10540 Ops.push_back(Elt: DAG.getTargetConstant(Val: DMask, DL, VT: MVT::i32));
10541 if (IsGFX10Plus)
10542 Ops.push_back(Elt: DAG.getTargetConstant(Val: DimInfo->Encoding, DL, VT: MVT::i32));
10543 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
10544 Ops.push_back(Elt: Unorm);
10545 Ops.push_back(Elt: DAG.getTargetConstant(Val: CPol, DL, VT: MVT::i32));
10546 Ops.push_back(Elt: IsA16 && // r128, a16 for gfx9
10547 ST->hasFeature(Feature: AMDGPU::FeatureR128A16)
10548 ? True
10549 : False);
10550 if (IsGFX10Plus)
10551 Ops.push_back(Elt: IsA16 ? True : False);
10552
10553 if (!Subtarget->hasGFX90AInsts())
10554 Ops.push_back(Elt: TFE); // tfe
10555 else if (TFE->getAsZExtVal()) {
10556 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
10557 DAG.getMachineFunction().getFunction(),
10558 "TFE is not supported on this GPU", DL.getDebugLoc()));
10559 }
10560
10561 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
10562 Ops.push_back(Elt: LWE); // lwe
10563 if (!IsGFX10Plus)
10564 Ops.push_back(Elt: DimInfo->DA ? True : False);
10565 if (BaseOpcode->HasD16)
10566 Ops.push_back(Elt: IsD16 ? True : False);
10567 if (isa<MemSDNode>(Val: Op))
10568 Ops.push_back(Elt: Op.getOperand(i: 0)); // chain
10569
10570 int NumVAddrDwords =
10571 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
10572 int Opcode = -1;
10573
  // Look up the concrete opcode for the subtarget's encoding, keyed by the
  // data and address dword counts. -1 means no such instruction exists.
10574 if (IsGFX13) {
10575 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx13,
10576 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
10577 } else if (IsGFX12Plus) {
10578 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx12,
10579 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
10580 } else if (IsGFX11Plus) {
10581 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode,
10582 MIMGEncoding: UseNSA ? AMDGPU::MIMGEncGfx11NSA
10583 : AMDGPU::MIMGEncGfx11Default,
10584 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
10585 } else if (IsGFX10Plus) {
10586 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode,
10587 MIMGEncoding: UseNSA ? AMDGPU::MIMGEncGfx10NSA
10588 : AMDGPU::MIMGEncGfx10Default,
10589 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
10590 } else {
10591 if (Subtarget->hasGFX90AInsts()) {
10592 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx90a,
10593 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
10594 if (Opcode == -1) {
10595 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
10596 DAG.getMachineFunction().getFunction(),
10597 "requested image instruction is not supported on this GPU",
10598 DL.getDebugLoc()));
10599
          // After the diagnostic, hand back poison for every data result and
          // pass the incoming chain through unchanged.
10600 unsigned Idx = 0;
10601 SmallVector<SDValue, 3> RetValues(OrigResultTypes.size());
10602 for (EVT VT : OrigResultTypes) {
10603 if (VT == MVT::Other)
10604 RetValues[Idx++] = Op.getOperand(i: 0); // Chain
10605 else
10606 RetValues[Idx++] = DAG.getPOISON(VT);
10607 }
10608
10609 return DAG.getMergeValues(Ops: RetValues, dl: DL);
10610 }
10611 }
10612 if (Opcode == -1 &&
10613 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
10614 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx8,
10615 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
10616 if (Opcode == -1)
10617 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx6,
10618 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
10619 }
10620 if (Opcode == -1)
10621 return Op;
10622
10623 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, dl: DL, ResultTys: ResultTypes, Ops);
10624 if (auto *MemOp = dyn_cast<MemSDNode>(Val&: Op)) {
10625 MachineMemOperand *MemRef = MemOp->getMemOperand();
10626 DAG.setNodeMemRefs(N: NewNode, NewMemRefs: {MemRef});
10627 }
10628
  // For a no-return atomic the data result type was erased from ResultTypes
  // above, so result 0 of the new node is presumably the chain -- TODO
  // confirm. The unused data result is replaced with poison.
10629 if (BaseOpcode->NoReturn) {
10630 if (BaseOpcode->Atomic)
10631 return DAG.getMergeValues(
10632 Ops: {DAG.getPOISON(VT: OrigResultTypes[0]), SDValue(NewNode, 0)}, dl: DL);
10633
10634 return SDValue(NewNode, 0);
10635 }
10636
  // X2 atomics produce a two-element vector; only element 0 is handed back as
  // the data result, together with result 1 of the new node.
10637 if (BaseOpcode->AtomicX2) {
10638 SmallVector<SDValue, 1> Elt;
10639 DAG.ExtractVectorElements(Op: SDValue(NewNode, 0), Args&: Elt, Start: 0, Count: 1);
10640 return DAG.getMergeValues(Ops: {Elt[0], SDValue(NewNode, 1)}, dl: DL);
10641 }
10642
10643 return constructRetValue(DAG, Result: NewNode, ResultTypes: OrigResultTypes, IsTexFail,
10644 Unpacked: Subtarget->hasUnpackedD16VMem(), IsD16, DMaskPop: DMaskLanes,
10645 NumVDataDwords, IsAtomicPacked16Bit, DL);
10646}
10647
10648SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
10649 SDValue Offset, SDValue CachePolicy,
10650 SelectionDAG &DAG) const {
10651 MachineFunction &MF = DAG.getMachineFunction();
10652
10653 const DataLayout &DataLayout = DAG.getDataLayout();
10654 Align Alignment =
10655 DataLayout.getABITypeAlign(Ty: VT.getTypeForEVT(Context&: *DAG.getContext()));
10656
10657 MachineMemOperand *MMO = MF.getMachineMemOperand(
10658 PtrInfo: MachinePointerInfo(),
10659 F: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
10660 MachineMemOperand::MOInvariant,
10661 Size: VT.getStoreSize(), BaseAlignment: Alignment);
10662
10663 if (!Offset->isDivergent()) {
10664 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
10665
10666 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
10667 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
10668 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
10669 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
10670 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
10671 SDValue BufferLoad =
10672 DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::SBUFFER_LOAD_USHORT, dl: DL,
10673 VTList: DAG.getVTList(VT: MVT::i32), Ops, MemVT: VT, MMO);
10674 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: BufferLoad);
10675 }
10676
10677 // Widen vec3 load to vec4.
10678 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
10679 !Subtarget->hasScalarDwordx3Loads()) {
10680 EVT WidenedVT =
10681 EVT::getVectorVT(Context&: *DAG.getContext(), VT: VT.getVectorElementType(), NumElements: 4);
10682 auto WidenedOp = DAG.getMemIntrinsicNode(
10683 Opcode: AMDGPUISD::SBUFFER_LOAD, dl: DL, VTList: DAG.getVTList(VT: WidenedVT), Ops, MemVT: WidenedVT,
10684 MMO: MF.getMachineMemOperand(MMO, Offset: 0, Size: WidenedVT.getStoreSize()));
10685 auto Subvector = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: WidenedOp,
10686 N2: DAG.getVectorIdxConstant(Val: 0, DL));
10687 return Subvector;
10688 }
10689
10690 return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::SBUFFER_LOAD, dl: DL,
10691 VTList: DAG.getVTList(VT), Ops, MemVT: VT, MMO);
10692 }
10693
10694 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
10695 // assume that the buffer is unswizzled.
10696 SDValue Ops[] = {
10697 DAG.getEntryNode(), // Chain
10698 Rsrc, // rsrc
10699 DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
10700 {}, // voffset
10701 {}, // soffset
10702 {}, // offset
10703 CachePolicy, // cachepolicy
10704 DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
10705 };
10706 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
10707 setBufferOffsets(CombinedOffset: Offset, DAG, Offsets: &Ops[3], Alignment: Align(4));
10708 return handleByteShortBufferLoads(DAG, LoadVT: VT, DL, Ops, MMO);
10709 }
10710
10711 SmallVector<SDValue, 4> Loads;
10712 unsigned NumLoads = 1;
10713 MVT LoadVT = VT.getSimpleVT();
10714 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
10715 assert((LoadVT.getScalarType() == MVT::i32 ||
10716 LoadVT.getScalarType() == MVT::f32));
10717
10718 if (NumElts == 8 || NumElts == 16) {
10719 NumLoads = NumElts / 4;
10720 LoadVT = MVT::getVectorVT(VT: LoadVT.getScalarType(), NumElements: 4);
10721 }
10722
10723 SDVTList VTList = DAG.getVTList(VTs: {LoadVT, MVT::Other});
10724
10725 // Use the alignment to ensure that the required offsets will fit into the
10726 // immediate offsets.
10727 setBufferOffsets(CombinedOffset: Offset, DAG, Offsets: &Ops[3],
10728 Alignment: NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
10729
10730 uint64_t InstOffset = Ops[5]->getAsZExtVal();
10731 unsigned LoadSize = LoadVT.getStoreSize();
10732 for (unsigned i = 0; i < NumLoads; ++i) {
10733 Ops[5] = DAG.getTargetConstant(Val: InstOffset + 16 * i, DL, VT: MVT::i32);
10734 MachineMemOperand *LoadMMO = MF.getMachineMemOperand(MMO, Offset: 16 * i, Size: LoadSize);
10735 Loads.push_back(Elt: getMemIntrinsicNode(Opcode: AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
10736 MemVT: LoadVT, MMO: LoadMMO, DAG));
10737 }
10738
10739 if (NumElts == 8 || NumElts == 16)
10740 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, Ops: Loads);
10741
10742 return Loads[0];
10743}
10744
10745SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
10746 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
10747 if (!Subtarget->hasArchitectedSGPRs())
10748 return {};
10749 SDLoc SL(Op);
10750 MVT VT = MVT::i32;
10751 SDValue TTMP8 = DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: SL, Reg: AMDGPU::TTMP8, VT);
10752 return DAG.getNode(Opcode: AMDGPUISD::BFE_U32, DL: SL, VT, N1: TTMP8,
10753 N2: DAG.getConstant(Val: 25, DL: SL, VT), N3: DAG.getConstant(Val: 5, DL: SL, VT));
10754}
10755
10756SDValue SITargetLowering::lowerConstHwRegRead(SelectionDAG &DAG, SDValue Op,
10757 AMDGPU::Hwreg::Id HwReg,
10758 unsigned LowBit,
10759 unsigned Width) const {
10760 SDLoc SL(Op);
10761 using namespace AMDGPU::Hwreg;
10762 return {DAG.getMachineNode(
10763 Opcode: AMDGPU::S_GETREG_B32_const, dl: SL, VT: MVT::i32,
10764 Op1: DAG.getTargetConstant(Val: HwregEncoding::encode(Values: HwReg, Values: LowBit, Values: Width),
10765 DL: SL, VT: MVT::i32)),
10766 0};
10767}
10768
10769SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
10770 unsigned Dim,
10771 const ArgDescriptor &Arg) const {
10772 SDLoc SL(Op);
10773 MachineFunction &MF = DAG.getMachineFunction();
10774 unsigned MaxID = Subtarget->getMaxWorkitemID(Kernel: MF.getFunction(), Dimension: Dim);
10775 if (MaxID == 0)
10776 return DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32);
10777
10778 // It's undefined behavior if a function marked with the amdgpu-no-*
10779 // attributes uses the corresponding intrinsic.
10780 if (!Arg)
10781 return DAG.getPOISON(VT: Op->getValueType(ResNo: 0));
10782
10783 SDValue Val = loadInputValue(DAG, RC: &AMDGPU::VGPR_32RegClass, VT: MVT::i32,
10784 SL: SDLoc(DAG.getEntryNode()), Arg);
10785
10786 // Don't bother inserting AssertZext for packed IDs since we're emitting the
10787 // masking operations anyway.
10788 //
10789 // TODO: We could assert the top bit is 0 for the source copy.
10790 if (Arg.isMasked())
10791 return Val;
10792
10793 // Preserve the known bits after expansion to a copy.
10794 EVT SmallVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: llvm::bit_width(Value: MaxID));
10795 return DAG.getNode(Opcode: ISD::AssertZext, DL: SL, VT: MVT::i32, N1: Val,
10796 N2: DAG.getValueType(SmallVT));
10797}
10798
10799SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
10800 SelectionDAG &DAG) const {
10801 MachineFunction &MF = DAG.getMachineFunction();
10802 auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
10803
10804 EVT VT = Op.getValueType();
10805 SDLoc DL(Op);
10806 unsigned IntrinsicID = Op.getConstantOperandVal(i: 0);
10807
10808 // TODO: Should this propagate fast-math-flags?
10809
10810 switch (IntrinsicID) {
10811 case Intrinsic::amdgcn_wave_reduce_min:
10812 case Intrinsic::amdgcn_wave_reduce_umin:
10813 case Intrinsic::amdgcn_wave_reduce_max:
10814 case Intrinsic::amdgcn_wave_reduce_umax:
10815 case Intrinsic::amdgcn_wave_reduce_add:
10816 case Intrinsic::amdgcn_wave_reduce_sub:
10817 case Intrinsic::amdgcn_wave_reduce_and:
10818 case Intrinsic::amdgcn_wave_reduce_or:
10819 case Intrinsic::amdgcn_wave_reduce_xor: {
10820 EVT SrcVT = Op.getOperand(i: 1).getValueType();
10821 if (SrcVT == MVT::i16) {
10822 bool NeedsSignExt = IntrinsicID == Intrinsic::amdgcn_wave_reduce_min ||
10823 IntrinsicID == Intrinsic::amdgcn_wave_reduce_max ||
10824 IntrinsicID == Intrinsic::amdgcn_wave_reduce_add ||
10825 IntrinsicID == Intrinsic::amdgcn_wave_reduce_sub;
10826 unsigned ExtOpc = NeedsSignExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
10827 SDValue ExtendedSrc = DAG.getNode(Opcode: ExtOpc, DL, VT: MVT::i32, Operand: Op.getOperand(i: 1));
10828 SDValue Strategy = Op.getOperand(i: 2);
10829 SDValue Result = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: MVT::i32,
10830 N1: Op.getOperand(i: 0), N2: ExtendedSrc, N3: Strategy);
10831 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: Result);
10832 }
10833 return SDValue();
10834 }
10835 case Intrinsic::amdgcn_implicit_buffer_ptr: {
10836 if (getSubtarget()->isAmdHsaOrMesa(F: MF.getFunction()))
10837 return emitNonHSAIntrinsicError(DAG, DL, VT);
10838 return getPreloadedValue(DAG, MFI: *MFI, VT,
10839 PVID: AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
10840 }
10841 case Intrinsic::amdgcn_dispatch_ptr:
10842 case Intrinsic::amdgcn_queue_ptr: {
10843 if (!Subtarget->isAmdHsaOrMesa(F: MF.getFunction())) {
10844 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
10845 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
10846 DL.getDebugLoc()));
10847 return DAG.getPOISON(VT);
10848 }
10849
10850 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
10851 ? AMDGPUFunctionArgInfo::DISPATCH_PTR
10852 : AMDGPUFunctionArgInfo::QUEUE_PTR;
10853 return getPreloadedValue(DAG, MFI: *MFI, VT, PVID: RegID);
10854 }
10855 case Intrinsic::amdgcn_implicitarg_ptr: {
10856 if (MFI->isEntryFunction())
10857 return getImplicitArgPtr(DAG, SL: DL);
10858 return getPreloadedValue(DAG, MFI: *MFI, VT,
10859 PVID: AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
10860 }
10861 case Intrinsic::amdgcn_kernarg_segment_ptr: {
10862 if (!AMDGPU::isKernel(F: MF.getFunction())) {
10863 // This only makes sense to call in a kernel, so just lower to null.
10864 return DAG.getConstant(Val: 0, DL, VT);
10865 }
10866
10867 return getPreloadedValue(DAG, MFI: *MFI, VT,
10868 PVID: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
10869 }
10870 case Intrinsic::amdgcn_dispatch_id: {
10871 return getPreloadedValue(DAG, MFI: *MFI, VT, PVID: AMDGPUFunctionArgInfo::DISPATCH_ID);
10872 }
10873 case Intrinsic::amdgcn_rcp:
10874 return DAG.getNode(Opcode: AMDGPUISD::RCP, DL, VT, Operand: Op.getOperand(i: 1));
10875 case Intrinsic::amdgcn_rsq:
10876 return DAG.getNode(Opcode: AMDGPUISD::RSQ, DL, VT, Operand: Op.getOperand(i: 1));
10877 case Intrinsic::amdgcn_rsq_legacy:
10878 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
10879 return emitRemovedIntrinsicError(DAG, DL, VT);
10880 return SDValue();
10881 case Intrinsic::amdgcn_rcp_legacy:
10882 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
10883 return emitRemovedIntrinsicError(DAG, DL, VT);
10884 return DAG.getNode(Opcode: AMDGPUISD::RCP_LEGACY, DL, VT, Operand: Op.getOperand(i: 1));
10885 case Intrinsic::amdgcn_fma_legacy:
10886 if (!Subtarget->hasFmaLegacy32Insts())
10887 return emitRemovedIntrinsicError(DAG, DL, VT);
10888 return SDValue();
10889 case Intrinsic::amdgcn_sudot4:
10890 case Intrinsic::amdgcn_sudot8:
10891 if (!Subtarget->hasDot8Insts())
10892 return emitRemovedIntrinsicError(DAG, DL, VT);
10893 return SDValue();
10894 case Intrinsic::amdgcn_tanh:
10895 if (!Subtarget->hasTanhInsts())
10896 return emitRemovedIntrinsicError(DAG, DL, VT);
10897 return SDValue();
10898 case Intrinsic::amdgcn_rsq_clamp: {
10899 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
10900 return DAG.getNode(Opcode: AMDGPUISD::RSQ_CLAMP, DL, VT, Operand: Op.getOperand(i: 1));
10901
10902 Type *Type = VT.getTypeForEVT(Context&: *DAG.getContext());
10903 APFloat Max = APFloat::getLargest(Sem: Type->getFltSemantics());
10904 APFloat Min = APFloat::getLargest(Sem: Type->getFltSemantics(), Negative: true);
10905
10906 SDValue Rsq = DAG.getNode(Opcode: AMDGPUISD::RSQ, DL, VT, Operand: Op.getOperand(i: 1));
10907 SDValue Tmp =
10908 DAG.getNode(Opcode: ISD::FMINNUM, DL, VT, N1: Rsq, N2: DAG.getConstantFP(Val: Max, DL, VT));
10909 return DAG.getNode(Opcode: ISD::FMAXNUM, DL, VT, N1: Tmp,
10910 N2: DAG.getConstantFP(Val: Min, DL, VT));
10911 }
10912 case Intrinsic::r600_read_ngroups_x:
10913 if (Subtarget->isAmdHsaOS())
10914 return emitNonHSAIntrinsicError(DAG, DL, VT);
10915
10916 return lowerKernargMemParameter(DAG, VT, MemVT: VT, SL: DL, Chain: DAG.getEntryNode(),
10917 Offset: SI::KernelInputOffsets::NGROUPS_X, Alignment: Align(4),
10918 Signed: false);
10919 case Intrinsic::r600_read_ngroups_y:
10920 if (Subtarget->isAmdHsaOS())
10921 return emitNonHSAIntrinsicError(DAG, DL, VT);
10922
10923 return lowerKernargMemParameter(DAG, VT, MemVT: VT, SL: DL, Chain: DAG.getEntryNode(),
10924 Offset: SI::KernelInputOffsets::NGROUPS_Y, Alignment: Align(4),
10925 Signed: false);
10926 case Intrinsic::r600_read_ngroups_z:
10927 if (Subtarget->isAmdHsaOS())
10928 return emitNonHSAIntrinsicError(DAG, DL, VT);
10929
10930 return lowerKernargMemParameter(DAG, VT, MemVT: VT, SL: DL, Chain: DAG.getEntryNode(),
10931 Offset: SI::KernelInputOffsets::NGROUPS_Z, Alignment: Align(4),
10932 Signed: false);
10933 case Intrinsic::r600_read_local_size_x:
10934 if (Subtarget->isAmdHsaOS())
10935 return emitNonHSAIntrinsicError(DAG, DL, VT);
10936
10937 return lowerImplicitZextParam(DAG, Op, VT: MVT::i16,
10938 Offset: SI::KernelInputOffsets::LOCAL_SIZE_X);
10939 case Intrinsic::r600_read_local_size_y:
10940 if (Subtarget->isAmdHsaOS())
10941 return emitNonHSAIntrinsicError(DAG, DL, VT);
10942
10943 return lowerImplicitZextParam(DAG, Op, VT: MVT::i16,
10944 Offset: SI::KernelInputOffsets::LOCAL_SIZE_Y);
10945 case Intrinsic::r600_read_local_size_z:
10946 if (Subtarget->isAmdHsaOS())
10947 return emitNonHSAIntrinsicError(DAG, DL, VT);
10948
10949 return lowerImplicitZextParam(DAG, Op, VT: MVT::i16,
10950 Offset: SI::KernelInputOffsets::LOCAL_SIZE_Z);
10951 case Intrinsic::amdgcn_workgroup_id_x:
10952 return lowerWorkGroupId(DAG, MFI: *MFI, VT,
10953 WorkGroupIdPV: AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
10954 ClusterMaxIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X,
10955 ClusterWorkGroupIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X);
10956 case Intrinsic::amdgcn_workgroup_id_y:
10957 return lowerWorkGroupId(DAG, MFI: *MFI, VT,
10958 WorkGroupIdPV: AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
10959 ClusterMaxIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y,
10960 ClusterWorkGroupIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y);
10961 case Intrinsic::amdgcn_workgroup_id_z:
10962 return lowerWorkGroupId(DAG, MFI: *MFI, VT,
10963 WorkGroupIdPV: AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
10964 ClusterMaxIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z,
10965 ClusterWorkGroupIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z);
10966 case Intrinsic::amdgcn_cluster_id_x:
10967 return Subtarget->hasClusters()
10968 ? getPreloadedValue(DAG, MFI: *MFI, VT,
10969 PVID: AMDGPUFunctionArgInfo::WORKGROUP_ID_X)
10970 : DAG.getPOISON(VT);
10971 case Intrinsic::amdgcn_cluster_id_y:
10972 return Subtarget->hasClusters()
10973 ? getPreloadedValue(DAG, MFI: *MFI, VT,
10974 PVID: AMDGPUFunctionArgInfo::WORKGROUP_ID_Y)
10975 : DAG.getPOISON(VT);
10976 case Intrinsic::amdgcn_cluster_id_z:
10977 return Subtarget->hasClusters()
10978 ? getPreloadedValue(DAG, MFI: *MFI, VT,
10979 PVID: AMDGPUFunctionArgInfo::WORKGROUP_ID_Z)
10980 : DAG.getPOISON(VT);
10981 case Intrinsic::amdgcn_cluster_workgroup_id_x:
10982 return Subtarget->hasClusters()
10983 ? getPreloadedValue(
10984 DAG, MFI: *MFI, VT,
10985 PVID: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X)
10986 : DAG.getPOISON(VT);
10987 case Intrinsic::amdgcn_cluster_workgroup_id_y:
10988 return Subtarget->hasClusters()
10989 ? getPreloadedValue(
10990 DAG, MFI: *MFI, VT,
10991 PVID: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y)
10992 : DAG.getPOISON(VT);
10993 case Intrinsic::amdgcn_cluster_workgroup_id_z:
10994 return Subtarget->hasClusters()
10995 ? getPreloadedValue(
10996 DAG, MFI: *MFI, VT,
10997 PVID: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z)
10998 : DAG.getPOISON(VT);
10999 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
11000 return Subtarget->hasClusters()
11001 ? lowerConstHwRegRead(DAG, Op, HwReg: AMDGPU::Hwreg::ID_IB_STS2, LowBit: 21, Width: 4)
11002 : SDValue();
11003 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
11004 return Subtarget->hasClusters()
11005 ? getPreloadedValue(
11006 DAG, MFI: *MFI, VT,
11007 PVID: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X)
11008 : DAG.getPOISON(VT);
11009 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
11010 return Subtarget->hasClusters()
11011 ? getPreloadedValue(
11012 DAG, MFI: *MFI, VT,
11013 PVID: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y)
11014 : DAG.getPOISON(VT);
11015 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
11016 return Subtarget->hasClusters()
11017 ? getPreloadedValue(
11018 DAG, MFI: *MFI, VT,
11019 PVID: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z)
11020 : DAG.getPOISON(VT);
11021 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
11022 return Subtarget->hasClusters()
11023 ? getPreloadedValue(
11024 DAG, MFI: *MFI, VT,
11025 PVID: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID)
11026 : DAG.getPOISON(VT);
11027 case Intrinsic::amdgcn_wave_id:
11028 return lowerWaveID(DAG, Op);
11029 case Intrinsic::amdgcn_lds_kernel_id: {
11030 if (MFI->isEntryFunction())
11031 return getLDSKernelId(DAG, SL: DL);
11032 return getPreloadedValue(DAG, MFI: *MFI, VT,
11033 PVID: AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
11034 }
11035 case Intrinsic::amdgcn_workitem_id_x:
11036 return lowerWorkitemID(DAG, Op, Dim: 0, Arg: MFI->getArgInfo().WorkItemIDX);
11037 case Intrinsic::amdgcn_workitem_id_y:
11038 return lowerWorkitemID(DAG, Op, Dim: 1, Arg: MFI->getArgInfo().WorkItemIDY);
11039 case Intrinsic::amdgcn_workitem_id_z:
11040 return lowerWorkitemID(DAG, Op, Dim: 2, Arg: MFI->getArgInfo().WorkItemIDZ);
11041 case Intrinsic::amdgcn_wavefrontsize:
11042 return DAG.getConstant(Val: MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
11043 DL: SDLoc(Op), VT: MVT::i32);
11044 case Intrinsic::amdgcn_s_buffer_load: {
11045 unsigned CPol = Op.getConstantOperandVal(i: 3);
11046 // s_buffer_load, because of how it's optimized, can't be volatile
11047 // so reject ones with the volatile bit set.
11048 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
11049 ? AMDGPU::CPol::ALL
11050 : AMDGPU::CPol::ALL_pregfx12))
11051 return Op;
11052 return lowerSBuffer(VT, DL, Rsrc: Op.getOperand(i: 1), Offset: Op.getOperand(i: 2),
11053 CachePolicy: Op.getOperand(i: 3), DAG);
11054 }
11055 case Intrinsic::amdgcn_fdiv_fast:
11056 return lowerFDIV_FAST(Op, DAG);
11057 case Intrinsic::amdgcn_sin:
11058 return DAG.getNode(Opcode: AMDGPUISD::SIN_HW, DL, VT, Operand: Op.getOperand(i: 1));
11059
11060 case Intrinsic::amdgcn_cos:
11061 return DAG.getNode(Opcode: AMDGPUISD::COS_HW, DL, VT, Operand: Op.getOperand(i: 1));
11062
11063 case Intrinsic::amdgcn_mul_u24:
11064 return DAG.getNode(Opcode: AMDGPUISD::MUL_U24, DL, VT, N1: Op.getOperand(i: 1),
11065 N2: Op.getOperand(i: 2));
11066 case Intrinsic::amdgcn_mul_i24:
11067 return DAG.getNode(Opcode: AMDGPUISD::MUL_I24, DL, VT, N1: Op.getOperand(i: 1),
11068 N2: Op.getOperand(i: 2));
11069
11070 case Intrinsic::amdgcn_log_clamp: {
11071 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
11072 return SDValue();
11073
11074 return emitRemovedIntrinsicError(DAG, DL, VT);
11075 }
11076 case Intrinsic::amdgcn_fract:
11077 return DAG.getNode(Opcode: AMDGPUISD::FRACT, DL, VT, Operand: Op.getOperand(i: 1));
11078
11079 case Intrinsic::amdgcn_class:
11080 return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL, VT, N1: Op.getOperand(i: 1),
11081 N2: Op.getOperand(i: 2));
11082 case Intrinsic::amdgcn_div_fmas:
11083 return DAG.getNode(Opcode: AMDGPUISD::DIV_FMAS, DL, VT, N1: Op.getOperand(i: 1),
11084 N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3), N4: Op.getOperand(i: 4));
11085
11086 case Intrinsic::amdgcn_div_fixup:
11087 return DAG.getNode(Opcode: AMDGPUISD::DIV_FIXUP, DL, VT, N1: Op.getOperand(i: 1),
11088 N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
11089
11090 case Intrinsic::amdgcn_div_scale: {
11091 const ConstantSDNode *Param = cast<ConstantSDNode>(Val: Op.getOperand(i: 3));
11092
11093 // Translate to the operands expected by the machine instruction. The
11094 // first parameter must be the same as the first instruction.
11095 SDValue Numerator = Op.getOperand(i: 1);
11096 SDValue Denominator = Op.getOperand(i: 2);
11097
11098 // Note this order is opposite of the machine instruction's operations,
11099 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
11100 // intrinsic has the numerator as the first operand to match a normal
11101 // division operation.
11102
11103 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
11104
11105 return DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL, VTList: Op->getVTList(), N1: Src0,
11106 N2: Denominator, N3: Numerator);
11107 }
11108 case Intrinsic::amdgcn_icmp: {
11109 // There is a Pat that handles this variant, so return it as-is.
11110 if (Op.getOperand(i: 1).getValueType() == MVT::i1 &&
11111 Op.getConstantOperandVal(i: 2) == 0 &&
11112 Op.getConstantOperandVal(i: 3) == ICmpInst::Predicate::ICMP_NE)
11113 return Op;
11114 return lowerICMPIntrinsic(TLI: *this, N: Op.getNode(), DAG);
11115 }
11116 case Intrinsic::amdgcn_fcmp: {
11117 return lowerFCMPIntrinsic(TLI: *this, N: Op.getNode(), DAG);
11118 }
11119 case Intrinsic::amdgcn_ballot:
11120 return lowerBALLOTIntrinsic(TLI: *this, N: Op.getNode(), DAG);
11121 case Intrinsic::amdgcn_fmed3:
11122 return DAG.getNode(Opcode: AMDGPUISD::FMED3, DL, VT, N1: Op.getOperand(i: 1),
11123 N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3), Flags: Op->getFlags());
11124 case Intrinsic::amdgcn_fdot2:
11125 return DAG.getNode(Opcode: AMDGPUISD::FDOT2, DL, VT, N1: Op.getOperand(i: 1),
11126 N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3), N4: Op.getOperand(i: 4));
11127 case Intrinsic::amdgcn_fmul_legacy:
11128 return DAG.getNode(Opcode: AMDGPUISD::FMUL_LEGACY, DL, VT, N1: Op.getOperand(i: 1),
11129 N2: Op.getOperand(i: 2));
11130 case Intrinsic::amdgcn_sbfe:
11131 return DAG.getNode(Opcode: AMDGPUISD::BFE_I32, DL, VT, N1: Op.getOperand(i: 1),
11132 N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
11133 case Intrinsic::amdgcn_ubfe:
11134 return DAG.getNode(Opcode: AMDGPUISD::BFE_U32, DL, VT, N1: Op.getOperand(i: 1),
11135 N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
11136 case Intrinsic::amdgcn_cvt_pkrtz:
11137 case Intrinsic::amdgcn_cvt_pknorm_i16:
11138 case Intrinsic::amdgcn_cvt_pknorm_u16:
11139 case Intrinsic::amdgcn_cvt_pk_i16:
11140 case Intrinsic::amdgcn_cvt_pk_u16: {
11141 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
11142 EVT VT = Op.getValueType();
11143 unsigned Opcode;
11144
11145 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
11146 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
11147 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
11148 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
11149 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
11150 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
11151 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
11152 Opcode = AMDGPUISD::CVT_PK_I16_I32;
11153 else
11154 Opcode = AMDGPUISD::CVT_PK_U16_U32;
11155
11156 if (isTypeLegal(VT))
11157 return DAG.getNode(Opcode, DL, VT, N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
11158
11159 SDValue Node =
11160 DAG.getNode(Opcode, DL, VT: MVT::i32, N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
11161 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Node);
11162 }
11163 case Intrinsic::amdgcn_fmad_ftz:
11164 return DAG.getNode(Opcode: AMDGPUISD::FMAD_FTZ, DL, VT, N1: Op.getOperand(i: 1),
11165 N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
11166
11167 case Intrinsic::amdgcn_if_break:
11168 return SDValue(DAG.getMachineNode(Opcode: AMDGPU::SI_IF_BREAK, dl: DL, VT,
11169 Op1: Op->getOperand(Num: 1), Op2: Op->getOperand(Num: 2)),
11170 0);
11171
11172 case Intrinsic::amdgcn_groupstaticsize: {
11173 Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
11174 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
11175 return Op;
11176
11177 const Module *M = MF.getFunction().getParent();
11178 const GlobalValue *GV =
11179 Intrinsic::getDeclarationIfExists(M, id: Intrinsic::amdgcn_groupstaticsize);
11180 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i32, offset: 0,
11181 TargetFlags: SIInstrInfo::MO_ABS32_LO);
11182 return {DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32, Op1: GA), 0};
11183 }
11184 case Intrinsic::amdgcn_is_shared:
11185 case Intrinsic::amdgcn_is_private: {
11186 SDLoc SL(Op);
11187 SDValue SrcVec =
11188 DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v2i32, Operand: Op.getOperand(i: 1));
11189 SDValue SrcHi = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: SrcVec,
11190 N2: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32));
11191
11192 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
11193 ? AMDGPUAS::LOCAL_ADDRESS
11194 : AMDGPUAS::PRIVATE_ADDRESS;
11195 if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
11196 Subtarget->hasGloballyAddressableScratch()) {
11197 SDValue FlatScratchBaseHi(
11198 DAG.getMachineNode(
11199 Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32,
11200 Op1: DAG.getRegister(Reg: AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, VT: MVT::i32)),
11201 0);
11202 // Test bits 63..58 against the aperture address.
11203 return DAG.getSetCC(
11204 DL: SL, VT: MVT::i1,
11205 LHS: DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i32, N1: SrcHi, N2: FlatScratchBaseHi),
11206 RHS: DAG.getConstant(Val: 1u << 26, DL: SL, VT: MVT::i32), Cond: ISD::SETULT);
11207 }
11208
11209 SDValue Aperture = getSegmentAperture(AS, DL: SL, DAG);
11210 return DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: SrcHi, RHS: Aperture, Cond: ISD::SETEQ);
11211 }
11212 case Intrinsic::amdgcn_perm:
11213 return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: Op.getOperand(i: 1),
11214 N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
11215 case Intrinsic::amdgcn_reloc_constant: {
11216 Module *M = MF.getFunction().getParent();
11217 const MDNode *Metadata = cast<MDNodeSDNode>(Val: Op.getOperand(i: 1))->getMD();
11218 auto SymbolName = cast<MDString>(Val: Metadata->getOperand(I: 0))->getString();
11219 auto *RelocSymbol = cast<GlobalVariable>(
11220 Val: M->getOrInsertGlobal(Name: SymbolName, Ty: Type::getInt32Ty(C&: M->getContext())));
11221 SDValue GA = DAG.getTargetGlobalAddress(GV: RelocSymbol, DL, VT: MVT::i32, offset: 0,
11222 TargetFlags: SIInstrInfo::MO_ABS32_LO);
11223 return {DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32, Op1: GA), 0};
11224 }
11225 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
11226 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
11227 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
11228 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
11229 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
11230 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
11231 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
11232 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
11233 if (Op.getOperand(i: 4).getValueType() == MVT::i32)
11234 return SDValue();
11235
11236 SDLoc SL(Op);
11237 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: 4), DL: SL, VT: MVT::i32);
11238 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: Op.getValueType(),
11239 N1: Op.getOperand(i: 0), N2: Op.getOperand(i: 1), N3: Op.getOperand(i: 2),
11240 N4: Op.getOperand(i: 3), N5: IndexKeyi32);
11241 }
11242 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
11243 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
11244 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
11245 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
11246 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
11247 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
11248 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
11249 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
11250 if (Op.getOperand(i: 4).getValueType() == MVT::i64)
11251 return SDValue();
11252
11253 SDLoc SL(Op);
11254 auto IndexKeyi64 =
11255 Op.getOperand(i: 4).getValueType() == MVT::v2i32
11256 ? DAG.getBitcast(VT: MVT::i64, V: Op.getOperand(i: 4))
11257 : DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: 4), DL: SL, VT: MVT::i64);
11258 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: Op.getValueType(),
11259 Ops: {Op.getOperand(i: 0), Op.getOperand(i: 1), Op.getOperand(i: 2),
11260 Op.getOperand(i: 3), IndexKeyi64, Op.getOperand(i: 5),
11261 Op.getOperand(i: 6)});
11262 }
11263 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
11264 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
11265 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
11266 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
11267 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
11268 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
11269 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
11270 ? MVT::i64
11271 : MVT::i32;
11272 if (Op.getOperand(i: 6).getValueType() == IndexKeyTy)
11273 return SDValue();
11274
11275 SDLoc SL(Op);
11276 auto IndexKey =
11277 Op.getOperand(i: 6).getValueType().isVector()
11278 ? DAG.getBitcast(VT: IndexKeyTy, V: Op.getOperand(i: 6))
11279 : DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: 6), DL: SL, VT: IndexKeyTy);
11280 SmallVector<SDValue> Args{
11281 Op.getOperand(i: 0), Op.getOperand(i: 1), Op.getOperand(i: 2),
11282 Op.getOperand(i: 3), Op.getOperand(i: 4), Op.getOperand(i: 5),
11283 IndexKey, Op.getOperand(i: 7), Op.getOperand(i: 8)};
11284 if (IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8)
11285 Args.push_back(Elt: Op.getOperand(i: 9));
11286 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: Op.getValueType(), Ops: Args);
11287 }
11288 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
11289 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
11290 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
11291 if (Op.getOperand(i: 6).getValueType() == MVT::i32)
11292 return SDValue();
11293
11294 SDLoc SL(Op);
11295 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: 6), DL: SL, VT: MVT::i32);
11296 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: Op.getValueType(),
11297 Ops: {Op.getOperand(i: 0), Op.getOperand(i: 1), Op.getOperand(i: 2),
11298 Op.getOperand(i: 3), Op.getOperand(i: 4), Op.getOperand(i: 5),
11299 IndexKeyi32, Op.getOperand(i: 7)});
11300 }
11301 case Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
11302 case Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4: {
11303 unsigned AFmt = (unsigned)Op.getConstantOperandVal(i: 1);
11304 unsigned BFmt = (unsigned)Op.getConstantOperandVal(i: 3);
11305 unsigned AScaleFmt = (unsigned)Op.getConstantOperandVal(i: 8);
11306 unsigned BScaleFmt = (unsigned)Op.getConstantOperandVal(i: 11);
11307 if (!AMDGPU::isValidWMMAScaleFmtCombination(AFmt, AScale: AScaleFmt, BFmt,
11308 BScale: BScaleFmt)) {
11309 DAG.getMachineFunction().getFunction().getContext().emitError(
11310 ErrorStr: "invalid matrix and scale format combination in wmma call");
11311 Op->print(OS&: errs());
11312 errs() << '\n';
11313 }
11314 return SDValue();
11315 }
11316 case Intrinsic::amdgcn_addrspacecast_nonnull:
11317 return lowerADDRSPACECAST(Op, DAG);
11318 case Intrinsic::amdgcn_readlane:
11319 case Intrinsic::amdgcn_readfirstlane:
11320 case Intrinsic::amdgcn_writelane:
11321 case Intrinsic::amdgcn_permlane16:
11322 case Intrinsic::amdgcn_permlanex16:
11323 case Intrinsic::amdgcn_permlane64:
11324 case Intrinsic::amdgcn_set_inactive:
11325 case Intrinsic::amdgcn_set_inactive_chain_arg:
11326 case Intrinsic::amdgcn_mov_dpp8:
11327 case Intrinsic::amdgcn_update_dpp:
11328 case Intrinsic::amdgcn_permlane_bcast:
11329 case Intrinsic::amdgcn_permlane_up:
11330 case Intrinsic::amdgcn_permlane_down:
11331 case Intrinsic::amdgcn_permlane_xor:
11332 return lowerLaneOp(TLI: *this, N: Op.getNode(), DAG);
11333 case Intrinsic::amdgcn_dead: {
11334 SmallVector<SDValue, 8> Poisons;
11335 for (const EVT ValTy : Op.getNode()->values())
11336 Poisons.push_back(Elt: DAG.getPOISON(VT: ValTy));
11337 return DAG.getMergeValues(Ops: Poisons, dl: SDLoc(Op));
11338 }
11339 case Intrinsic::amdgcn_wave_shuffle:
11340 return lowerWaveShuffle(TLI: *this, N: Op.getNode(), DAG);
11341 default:
11342 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11343 AMDGPU::getImageDimIntrinsicInfo(Intr: IntrinsicID))
11344 return lowerImage(Op, Intr: ImageDimIntr, DAG, WithChain: false);
11345
11346 return Op;
11347 }
11348}
11349
11350// On targets not supporting constant in soffset field, turn zero to
11351// SGPR_NULL to avoid generating an extra s_mov with zero.
11352static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
11353 const GCNSubtarget *Subtarget) {
11354 if (Subtarget->hasRestrictedSOffset() && isNullConstant(V: SOffset))
11355 return DAG.getRegister(Reg: AMDGPU::SGPR_NULL, VT: MVT::i32);
11356 return SOffset;
11357}
11358
11359SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
11360 SelectionDAG &DAG,
11361 unsigned NewOpcode) const {
11362 SDLoc DL(Op);
11363
11364 SDValue VData = Op.getOperand(i: 2);
11365 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG);
11366 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG);
11367 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget);
11368 SDValue Ops[] = {
11369 Op.getOperand(i: 0), // Chain
11370 VData, // vdata
11371 Rsrc, // rsrc
11372 DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
11373 VOffset, // voffset
11374 SOffset, // soffset
11375 Offset, // offset
11376 Op.getOperand(i: 6), // cachepolicy
11377 DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
11378 };
11379
11380 auto *M = cast<MemSDNode>(Val&: Op);
11381
11382 EVT MemVT = VData.getValueType();
11383 return DAG.getMemIntrinsicNode(Opcode: NewOpcode, dl: DL, VTList: Op->getVTList(), Ops, MemVT,
11384 MMO: M->getMemOperand());
11385}
11386
11387SDValue
11388SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
11389 unsigned NewOpcode) const {
11390 SDLoc DL(Op);
11391
11392 SDValue VData = Op.getOperand(i: 2);
11393 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG);
11394 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 5), DAG);
11395 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 6), DAG, Subtarget);
11396 SDValue Ops[] = {
11397 Op.getOperand(i: 0), // Chain
11398 VData, // vdata
11399 Rsrc, // rsrc
11400 Op.getOperand(i: 4), // vindex
11401 VOffset, // voffset
11402 SOffset, // soffset
11403 Offset, // offset
11404 Op.getOperand(i: 7), // cachepolicy
11405 DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // idxen
11406 };
11407
11408 auto *M = cast<MemSDNode>(Val&: Op);
11409
11410 EVT MemVT = VData.getValueType();
11411 return DAG.getMemIntrinsicNode(Opcode: NewOpcode, dl: DL, VTList: Op->getVTList(), Ops, MemVT,
11412 MMO: M->getMemOperand());
11413}
11414
11415SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
11416 SelectionDAG &DAG) const {
11417 unsigned IntrID = Op.getConstantOperandVal(i: 1);
11418 SDLoc DL(Op);
11419
11420 switch (IntrID) {
11421 case Intrinsic::amdgcn_ds_ordered_add:
11422 case Intrinsic::amdgcn_ds_ordered_swap: {
11423 MemSDNode *M = cast<MemSDNode>(Val&: Op);
11424 SDValue Chain = M->getOperand(Num: 0);
11425 SDValue M0 = M->getOperand(Num: 2);
11426 SDValue Value = M->getOperand(Num: 3);
11427 unsigned IndexOperand = M->getConstantOperandVal(Num: 7);
11428 unsigned WaveRelease = M->getConstantOperandVal(Num: 8);
11429 unsigned WaveDone = M->getConstantOperandVal(Num: 9);
11430
11431 unsigned OrderedCountIndex = IndexOperand & 0x3f;
11432 IndexOperand &= ~0x3f;
11433 unsigned CountDw = 0;
11434
11435 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
11436 CountDw = (IndexOperand >> 24) & 0xf;
11437 IndexOperand &= ~(0xf << 24);
11438
11439 if (CountDw < 1 || CountDw > 4) {
11440 const Function &Fn = DAG.getMachineFunction().getFunction();
11441 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
11442 Fn, "ds_ordered_count: dword count must be between 1 and 4",
11443 DL.getDebugLoc()));
11444 CountDw = 1;
11445 }
11446 }
11447
11448 if (IndexOperand) {
11449 const Function &Fn = DAG.getMachineFunction().getFunction();
11450 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
11451 Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
11452 }
11453
11454 if (WaveDone && !WaveRelease) {
11455 // TODO: Move this to IR verifier
11456 const Function &Fn = DAG.getMachineFunction().getFunction();
11457 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
11458 Fn, "ds_ordered_count: wave_done requires wave_release",
11459 DL.getDebugLoc()));
11460 }
11461
11462 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
11463 unsigned ShaderType =
11464 SIInstrInfo::getDSShaderTypeValue(MF: DAG.getMachineFunction());
11465 unsigned Offset0 = OrderedCountIndex << 2;
11466 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
11467
11468 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
11469 Offset1 |= (CountDw - 1) << 6;
11470
11471 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
11472 Offset1 |= ShaderType << 2;
11473
11474 unsigned Offset = Offset0 | (Offset1 << 8);
11475
11476 SDValue Ops[] = {
11477 Chain, Value, DAG.getTargetConstant(Val: Offset, DL, VT: MVT::i16),
11478 copyToM0(DAG, Chain, DL, V: M0).getValue(R: 1), // Glue
11479 };
11480 return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::DS_ORDERED_COUNT, dl: DL,
11481 VTList: M->getVTList(), Ops, MemVT: M->getMemoryVT(),
11482 MMO: M->getMemOperand());
11483 }
11484 case Intrinsic::amdgcn_raw_buffer_load:
11485 case Intrinsic::amdgcn_raw_ptr_buffer_load:
11486 case Intrinsic::amdgcn_raw_atomic_buffer_load:
11487 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
11488 case Intrinsic::amdgcn_raw_buffer_load_format:
11489 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
11490 const bool IsFormat =
11491 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
11492 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
11493
11494 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG);
11495 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 3), DAG);
11496 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 4), DAG, Subtarget);
11497 SDValue Ops[] = {
11498 Op.getOperand(i: 0), // Chain
11499 Rsrc, // rsrc
11500 DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
11501 VOffset, // voffset
11502 SOffset, // soffset
11503 Offset, // offset
11504 Op.getOperand(i: 5), // cachepolicy, swizzled buffer
11505 DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
11506 };
11507
11508 auto *M = cast<MemSDNode>(Val&: Op);
11509 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
11510 }
11511 case Intrinsic::amdgcn_struct_buffer_load:
11512 case Intrinsic::amdgcn_struct_ptr_buffer_load:
11513 case Intrinsic::amdgcn_struct_buffer_load_format:
11514 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
11515 case Intrinsic::amdgcn_struct_atomic_buffer_load:
11516 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
11517 const bool IsFormat =
11518 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
11519 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
11520
11521 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG);
11522 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG);
11523 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget);
11524 SDValue Ops[] = {
11525 Op.getOperand(i: 0), // Chain
11526 Rsrc, // rsrc
11527 Op.getOperand(i: 3), // vindex
11528 VOffset, // voffset
11529 SOffset, // soffset
11530 Offset, // offset
11531 Op.getOperand(i: 6), // cachepolicy, swizzled buffer
11532 DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // idxen
11533 };
11534
11535 return lowerIntrinsicLoad(M: cast<MemSDNode>(Val&: Op), IsFormat, DAG, Ops);
11536 }
11537 case Intrinsic::amdgcn_raw_tbuffer_load:
11538 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
11539 MemSDNode *M = cast<MemSDNode>(Val&: Op);
11540 EVT LoadVT = Op.getValueType();
11541 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG);
11542 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 3), DAG);
11543 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 4), DAG, Subtarget);
11544
11545 SDValue Ops[] = {
11546 Op.getOperand(i: 0), // Chain
11547 Rsrc, // rsrc
11548 DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
11549 VOffset, // voffset
11550 SOffset, // soffset
11551 Offset, // offset
11552 Op.getOperand(i: 5), // format
11553 Op.getOperand(i: 6), // cachepolicy, swizzled buffer
11554 DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
11555 };
11556
11557 if (LoadVT.getScalarType() == MVT::f16)
11558 return adjustLoadValueType(Opcode: AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
11559 Ops);
11560 return getMemIntrinsicNode(Opcode: AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
11561 VTList: Op->getVTList(), Ops, MemVT: LoadVT, MMO: M->getMemOperand(),
11562 DAG);
11563 }
11564 case Intrinsic::amdgcn_struct_tbuffer_load:
11565 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
11566 MemSDNode *M = cast<MemSDNode>(Val&: Op);
11567 EVT LoadVT = Op.getValueType();
11568 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG);
11569 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG);
11570 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget);
11571
11572 SDValue Ops[] = {
11573 Op.getOperand(i: 0), // Chain
11574 Rsrc, // rsrc
11575 Op.getOperand(i: 3), // vindex
11576 VOffset, // voffset
11577 SOffset, // soffset
11578 Offset, // offset
11579 Op.getOperand(i: 6), // format
11580 Op.getOperand(i: 7), // cachepolicy, swizzled buffer
11581 DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // idxen
11582 };
11583
11584 if (LoadVT.getScalarType() == MVT::f16)
11585 return adjustLoadValueType(Opcode: AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
11586 Ops);
11587 return getMemIntrinsicNode(Opcode: AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
11588 VTList: Op->getVTList(), Ops, MemVT: LoadVT, MMO: M->getMemOperand(),
11589 DAG);
11590 }
11591 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
11592 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
11593 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FADD);
11594 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
11595 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
11596 return lowerStructBufferAtomicIntrin(Op, DAG,
11597 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FADD);
11598 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
11599 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
11600 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FMIN);
11601 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
11602 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
11603 return lowerStructBufferAtomicIntrin(Op, DAG,
11604 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FMIN);
11605 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
11606 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
11607 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FMAX);
11608 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
11609 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
11610 return lowerStructBufferAtomicIntrin(Op, DAG,
11611 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FMAX);
11612 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
11613 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
11614 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SWAP);
11615 case Intrinsic::amdgcn_raw_buffer_atomic_add:
11616 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
11617 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_ADD);
11618 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
11619 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
11620 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SUB);
11621 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
11622 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
11623 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SMIN);
11624 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
11625 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
11626 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_UMIN);
11627 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
11628 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
11629 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SMAX);
11630 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
11631 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
11632 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_UMAX);
11633 case Intrinsic::amdgcn_raw_buffer_atomic_and:
11634 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
11635 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_AND);
11636 case Intrinsic::amdgcn_raw_buffer_atomic_or:
11637 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
11638 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_OR);
11639 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
11640 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
11641 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_XOR);
11642 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
11643 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
11644 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_INC);
11645 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
11646 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
11647 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_DEC);
11648 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
11649 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
11650 return lowerStructBufferAtomicIntrin(Op, DAG,
11651 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SWAP);
11652 case Intrinsic::amdgcn_struct_buffer_atomic_add:
11653 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
11654 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_ADD);
11655 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
11656 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
11657 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SUB);
11658 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
11659 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
11660 return lowerStructBufferAtomicIntrin(Op, DAG,
11661 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SMIN);
11662 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
11663 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
11664 return lowerStructBufferAtomicIntrin(Op, DAG,
11665 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_UMIN);
11666 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
11667 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
11668 return lowerStructBufferAtomicIntrin(Op, DAG,
11669 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SMAX);
11670 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
11671 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
11672 return lowerStructBufferAtomicIntrin(Op, DAG,
11673 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_UMAX);
11674 case Intrinsic::amdgcn_struct_buffer_atomic_and:
11675 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
11676 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_AND);
11677 case Intrinsic::amdgcn_struct_buffer_atomic_or:
11678 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
11679 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_OR);
11680 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
11681 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
11682 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_XOR);
11683 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
11684 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
11685 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_INC);
11686 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
11687 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
11688 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_DEC);
11689 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
11690 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
11691 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_CSUB);
11692 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
11693 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
11694 return lowerStructBufferAtomicIntrin(Op, DAG,
11695 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_CSUB);
11696 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
11697 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
11698 return lowerRawBufferAtomicIntrin(Op, DAG,
11699 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
11700 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
11701 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
11702 return lowerStructBufferAtomicIntrin(Op, DAG,
11703 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
11704 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
11705 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
11706 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 4), DAG);
11707 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 5), DAG);
11708 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 6), DAG, Subtarget);
11709 SDValue Ops[] = {
11710 Op.getOperand(i: 0), // Chain
11711 Op.getOperand(i: 2), // src
11712 Op.getOperand(i: 3), // cmp
11713 Rsrc, // rsrc
11714 DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
11715 VOffset, // voffset
11716 SOffset, // soffset
11717 Offset, // offset
11718 Op.getOperand(i: 7), // cachepolicy
11719 DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
11720 };
11721 EVT VT = Op.getValueType();
11722 auto *M = cast<MemSDNode>(Val&: Op);
11723
11724 return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, dl: DL,
11725 VTList: Op->getVTList(), Ops, MemVT: VT,
11726 MMO: M->getMemOperand());
11727 }
11728 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
11729 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
11730 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op->getOperand(Num: 4), DAG);
11731 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 6), DAG);
11732 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 7), DAG, Subtarget);
11733 SDValue Ops[] = {
11734 Op.getOperand(i: 0), // Chain
11735 Op.getOperand(i: 2), // src
11736 Op.getOperand(i: 3), // cmp
11737 Rsrc, // rsrc
11738 Op.getOperand(i: 5), // vindex
11739 VOffset, // voffset
11740 SOffset, // soffset
11741 Offset, // offset
11742 Op.getOperand(i: 8), // cachepolicy
11743 DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // idxen
11744 };
11745 EVT VT = Op.getValueType();
11746 auto *M = cast<MemSDNode>(Val&: Op);
11747
11748 return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, dl: DL,
11749 VTList: Op->getVTList(), Ops, MemVT: VT,
11750 MMO: M->getMemOperand());
11751 }
11752 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
11753 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
11754 MemSDNode *M = cast<MemSDNode>(Val&: Op);
11755 SDValue NodePtr = M->getOperand(Num: 2);
11756 SDValue RayExtent = M->getOperand(Num: 3);
11757 SDValue InstanceMask = M->getOperand(Num: 4);
11758 SDValue RayOrigin = M->getOperand(Num: 5);
11759 SDValue RayDir = M->getOperand(Num: 6);
11760 SDValue Offsets = M->getOperand(Num: 7);
11761 SDValue TDescr = M->getOperand(Num: 8);
11762
11763 assert(NodePtr.getValueType() == MVT::i64);
11764 assert(RayDir.getValueType() == MVT::v3f32);
11765
11766 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
11767 emitRemovedIntrinsicError(DAG, DL, VT: Op.getValueType());
11768 return SDValue();
11769 }
11770
11771 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
11772 const unsigned NumVDataDwords = 10;
11773 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
11774 int Opcode = AMDGPU::getMIMGOpcode(
11775 BaseOpcode: IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
11776 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
11777 MIMGEncoding: AMDGPU::MIMGEncGfx12, VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
11778 assert(Opcode != -1);
11779
11780 SmallVector<SDValue, 7> Ops;
11781 Ops.push_back(Elt: NodePtr);
11782 Ops.push_back(Elt: DAG.getBuildVector(
11783 VT: MVT::v2i32, DL,
11784 Ops: {DAG.getBitcast(VT: MVT::i32, V: RayExtent),
11785 DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::i32, Operand: InstanceMask)}));
11786 Ops.push_back(Elt: RayOrigin);
11787 Ops.push_back(Elt: RayDir);
11788 Ops.push_back(Elt: Offsets);
11789 Ops.push_back(Elt: TDescr);
11790 Ops.push_back(Elt: M->getChain());
11791
11792 auto *NewNode = DAG.getMachineNode(Opcode, dl: DL, VTs: M->getVTList(), Ops);
11793 MachineMemOperand *MemRef = M->getMemOperand();
11794 DAG.setNodeMemRefs(N: NewNode, NewMemRefs: {MemRef});
11795 return SDValue(NewNode, 0);
11796 }
11797 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
11798 MemSDNode *M = cast<MemSDNode>(Val&: Op);
11799 SDValue NodePtr = M->getOperand(Num: 2);
11800 SDValue RayExtent = M->getOperand(Num: 3);
11801 SDValue RayOrigin = M->getOperand(Num: 4);
11802 SDValue RayDir = M->getOperand(Num: 5);
11803 SDValue RayInvDir = M->getOperand(Num: 6);
11804 SDValue TDescr = M->getOperand(Num: 7);
11805
11806 assert(NodePtr.getValueType() == MVT::i32 ||
11807 NodePtr.getValueType() == MVT::i64);
11808 assert(RayDir.getValueType() == MVT::v3f16 ||
11809 RayDir.getValueType() == MVT::v3f32);
11810
11811 if (!Subtarget->hasGFX10_AEncoding()) {
11812 emitRemovedIntrinsicError(DAG, DL, VT: Op.getValueType());
11813 return SDValue();
11814 }
11815
11816 const bool IsGFX11 = AMDGPU::isGFX11(STI: *Subtarget);
11817 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI: *Subtarget);
11818 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI: *Subtarget);
11819 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
11820 const bool Is64 = NodePtr.getValueType() == MVT::i64;
11821 const unsigned NumVDataDwords = 4;
11822 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
11823 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
11824 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
11825 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
11826 IsGFX12Plus;
11827 const unsigned BaseOpcodes[2][2] = {
11828 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
11829 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
11830 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
11831 int Opcode;
11832 if (UseNSA) {
11833 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: BaseOpcodes[Is64][IsA16],
11834 MIMGEncoding: IsGFX12Plus ? AMDGPU::MIMGEncGfx12
11835 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
11836 : AMDGPU::MIMGEncGfx10NSA,
11837 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
11838 } else {
11839 assert(!IsGFX12Plus);
11840 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: BaseOpcodes[Is64][IsA16],
11841 MIMGEncoding: IsGFX11 ? AMDGPU::MIMGEncGfx11Default
11842 : AMDGPU::MIMGEncGfx10Default,
11843 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
11844 }
11845 assert(Opcode != -1);
11846
11847 SmallVector<SDValue, 16> Ops;
11848
11849 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
11850 SmallVector<SDValue, 3> Lanes;
11851 DAG.ExtractVectorElements(Op, Args&: Lanes, Start: 0, Count: 3);
11852 if (Lanes[0].getValueSizeInBits() == 32) {
11853 for (unsigned I = 0; I < 3; ++I)
11854 Ops.push_back(Elt: DAG.getBitcast(VT: MVT::i32, V: Lanes[I]));
11855 } else {
11856 if (IsAligned) {
11857 Ops.push_back(Elt: DAG.getBitcast(
11858 VT: MVT::i32,
11859 V: DAG.getBuildVector(VT: MVT::v2f16, DL, Ops: {Lanes[0], Lanes[1]})));
11860 Ops.push_back(Elt: Lanes[2]);
11861 } else {
11862 SDValue Elt0 = Ops.pop_back_val();
11863 Ops.push_back(Elt: DAG.getBitcast(
11864 VT: MVT::i32, V: DAG.getBuildVector(VT: MVT::v2f16, DL, Ops: {Elt0, Lanes[0]})));
11865 Ops.push_back(Elt: DAG.getBitcast(
11866 VT: MVT::i32,
11867 V: DAG.getBuildVector(VT: MVT::v2f16, DL, Ops: {Lanes[1], Lanes[2]})));
11868 }
11869 }
11870 };
11871
11872 if (UseNSA && IsGFX11Plus) {
11873 Ops.push_back(Elt: NodePtr);
11874 Ops.push_back(Elt: DAG.getBitcast(VT: MVT::i32, V: RayExtent));
11875 Ops.push_back(Elt: RayOrigin);
11876 if (IsA16) {
11877 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
11878 DAG.ExtractVectorElements(Op: RayDir, Args&: DirLanes, Start: 0, Count: 3);
11879 DAG.ExtractVectorElements(Op: RayInvDir, Args&: InvDirLanes, Start: 0, Count: 3);
11880 for (unsigned I = 0; I < 3; ++I) {
11881 MergedLanes.push_back(Elt: DAG.getBitcast(
11882 VT: MVT::i32, V: DAG.getBuildVector(VT: MVT::v2f16, DL,
11883 Ops: {DirLanes[I], InvDirLanes[I]})));
11884 }
11885 Ops.push_back(Elt: DAG.getBuildVector(VT: MVT::v3i32, DL, Ops: MergedLanes));
11886 } else {
11887 Ops.push_back(Elt: RayDir);
11888 Ops.push_back(Elt: RayInvDir);
11889 }
11890 } else {
11891 if (Is64)
11892 DAG.ExtractVectorElements(Op: DAG.getBitcast(VT: MVT::v2i32, V: NodePtr), Args&: Ops, Start: 0,
11893 Count: 2);
11894 else
11895 Ops.push_back(Elt: NodePtr);
11896
11897 Ops.push_back(Elt: DAG.getBitcast(VT: MVT::i32, V: RayExtent));
11898 packLanes(RayOrigin, true);
11899 packLanes(RayDir, true);
11900 packLanes(RayInvDir, false);
11901 }
11902
11903 if (!UseNSA) {
11904 // Build a single vector containing all the operands so far prepared.
11905 if (NumVAddrDwords > 12) {
11906 SDValue Undef = DAG.getPOISON(VT: MVT::i32);
11907 Ops.append(NumInputs: 16 - Ops.size(), Elt: Undef);
11908 }
11909 assert(Ops.size() >= 8 && Ops.size() <= 12);
11910 SDValue MergedOps =
11911 DAG.getBuildVector(VT: MVT::getVectorVT(VT: MVT::i32, NumElements: Ops.size()), DL, Ops);
11912 Ops.clear();
11913 Ops.push_back(Elt: MergedOps);
11914 }
11915
11916 Ops.push_back(Elt: TDescr);
11917 Ops.push_back(Elt: DAG.getTargetConstant(Val: IsA16, DL, VT: MVT::i1));
11918 Ops.push_back(Elt: M->getChain());
11919
11920 auto *NewNode = DAG.getMachineNode(Opcode, dl: DL, VTs: M->getVTList(), Ops);
11921 MachineMemOperand *MemRef = M->getMemOperand();
11922 DAG.setNodeMemRefs(N: NewNode, NewMemRefs: {MemRef});
11923 return SDValue(NewNode, 0);
11924 }
11925 case Intrinsic::amdgcn_global_atomic_fmin_num:
11926 case Intrinsic::amdgcn_global_atomic_fmax_num:
11927 case Intrinsic::amdgcn_flat_atomic_fmin_num:
11928 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
11929 MemSDNode *M = cast<MemSDNode>(Val&: Op);
11930 SDValue Ops[] = {
11931 M->getOperand(Num: 0), // Chain
11932 M->getOperand(Num: 2), // Ptr
11933 M->getOperand(Num: 3) // Value
11934 };
11935 unsigned Opcode = 0;
11936 switch (IntrID) {
11937 case Intrinsic::amdgcn_global_atomic_fmin_num:
11938 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
11939 Opcode = ISD::ATOMIC_LOAD_FMIN;
11940 break;
11941 }
11942 case Intrinsic::amdgcn_global_atomic_fmax_num:
11943 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
11944 Opcode = ISD::ATOMIC_LOAD_FMAX;
11945 break;
11946 }
11947 default:
11948 llvm_unreachable("unhandled atomic opcode");
11949 }
11950 return DAG.getAtomic(Opcode, dl: SDLoc(Op), MemVT: M->getMemoryVT(), VTList: M->getVTList(),
11951 Ops, MMO: M->getMemOperand());
11952 }
11953 case Intrinsic::amdgcn_s_alloc_vgpr: {
11954 SDValue NumVGPRs = Op.getOperand(i: 2);
11955 if (!NumVGPRs->isDivergent())
11956 return Op;
11957
11958 SDValue ReadFirstLaneID =
11959 DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL, VT: MVT::i32);
11960 NumVGPRs = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: MVT::i32,
11961 N1: ReadFirstLaneID, N2: NumVGPRs);
11962
11963 return DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL, VTList: Op->getVTList(),
11964 N1: Op.getOperand(i: 0), N2: Op.getOperand(i: 1), N3: NumVGPRs);
11965 }
11966 case Intrinsic::amdgcn_s_get_barrier_state:
11967 case Intrinsic::amdgcn_s_get_named_barrier_state: {
11968 SDValue Chain = Op->getOperand(Num: 0);
11969 SmallVector<SDValue, 2> Ops;
11970 unsigned Opc;
11971
11972 if (isa<ConstantSDNode>(Val: Op->getOperand(Num: 2))) {
11973 uint64_t BarID = cast<ConstantSDNode>(Val: Op->getOperand(Num: 2))->getZExtValue();
11974 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
11975 BarID = (BarID >> 4) & 0x3F;
11976 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
11977 SDValue K = DAG.getTargetConstant(Val: BarID, DL, VT: MVT::i32);
11978 Ops.push_back(Elt: K);
11979 Ops.push_back(Elt: Chain);
11980 } else {
11981 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
11982 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
11983 SDValue M0Val;
11984 M0Val = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: Op->getOperand(Num: 2),
11985 N2: DAG.getShiftAmountConstant(Val: 4, VT: MVT::i32, DL));
11986 M0Val = SDValue(
11987 DAG.getMachineNode(Opcode: AMDGPU::S_AND_B32, dl: DL, VT: MVT::i32, Op1: M0Val,
11988 Op2: DAG.getTargetConstant(Val: 0x3F, DL, VT: MVT::i32)),
11989 0);
11990 Ops.push_back(Elt: copyToM0(DAG, Chain, DL, V: M0Val).getValue(R: 0));
11991 } else
11992 Ops.push_back(Elt: copyToM0(DAG, Chain, DL, V: Op->getOperand(Num: 2)).getValue(R: 0));
11993 }
11994
11995 auto *NewMI = DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: Op->getVTList(), Ops);
11996 return SDValue(NewMI, 0);
11997 }
11998 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
11999 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
12000 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
12001 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Val&: Op);
12002 SDValue Chain = Op->getOperand(Num: 0);
12003 SDValue Ptr = Op->getOperand(Num: 2);
12004 EVT VT = Op->getValueType(ResNo: 0);
12005 return DAG.getAtomicLoad(ExtType: ISD::NON_EXTLOAD, dl: DL, MemVT: MII->getMemoryVT(), VT,
12006 Chain, Ptr, MMO: MII->getMemOperand());
12007 }
12008 case Intrinsic::amdgcn_av_load_b128: {
12009 if (!Subtarget->hasFlatGlobalInsts()) {
12010 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
12011 DAG.getMachineFunction().getFunction(),
12012 "llvm.amdgcn.av.load.b128 not supported on subtarget",
12013 DL.getDebugLoc()));
12014 return DAG.getMergeValues(
12015 Ops: {DAG.getPOISON(VT: Op->getValueType(ResNo: 0)), Op->getOperand(Num: 0)}, dl: DL);
12016 }
12017 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Val&: Op);
12018 SDValue Chain = Op->getOperand(Num: 0);
12019 SDValue Ptr = Op->getOperand(Num: 2);
12020 EVT VT = Op->getValueType(ResNo: 0);
12021 // Lower to a regular ISD::LOAD. The MachineMemOperand carries Monotonic
12022 // ordering and syncscope so that SIMemoryLegalizer sets cache policy bits.
12023 // Address space filtering in the load_global/load_flat PatFrags selects
12024 // the correct GLOBAL vs FLAT instruction.
12025 return DAG.getLoad(VT, dl: DL, Chain, Ptr, MMO: MII->getMemOperand());
12026 }
12027 case Intrinsic::amdgcn_flat_load_monitor_b32:
12028 case Intrinsic::amdgcn_flat_load_monitor_b64:
12029 case Intrinsic::amdgcn_flat_load_monitor_b128: {
12030 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Val&: Op);
12031 SDValue Chain = Op->getOperand(Num: 0);
12032 SDValue Ptr = Op->getOperand(Num: 2);
12033 return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::FLAT_LOAD_MONITOR, dl: DL,
12034 VTList: Op->getVTList(), Ops: {Chain, Ptr},
12035 MemVT: MII->getMemoryVT(), MMO: MII->getMemOperand());
12036 }
12037 case Intrinsic::amdgcn_global_load_monitor_b32:
12038 case Intrinsic::amdgcn_global_load_monitor_b64:
12039 case Intrinsic::amdgcn_global_load_monitor_b128: {
12040 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Val&: Op);
12041 SDValue Chain = Op->getOperand(Num: 0);
12042 SDValue Ptr = Op->getOperand(Num: 2);
12043 return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::GLOBAL_LOAD_MONITOR, dl: DL,
12044 VTList: Op->getVTList(), Ops: {Chain, Ptr},
12045 MemVT: MII->getMemoryVT(), MMO: MII->getMemOperand());
12046 }
12047 default:
12048
12049 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
12050 AMDGPU::getImageDimIntrinsicInfo(Intr: IntrID))
12051 return lowerImage(Op, Intr: ImageDimIntr, DAG, WithChain: true);
12052
12053 return SDValue();
12054 }
12055}
12056
12057// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
12058// dwordx4 if on SI and handle TFE loads.
12059SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
12060 SDVTList VTList,
12061 ArrayRef<SDValue> Ops, EVT MemVT,
12062 MachineMemOperand *MMO,
12063 SelectionDAG &DAG) const {
12064 LLVMContext &C = *DAG.getContext();
12065 MachineFunction &MF = DAG.getMachineFunction();
12066 EVT VT = VTList.VTs[0];
12067
12068 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
12069 bool IsTFE = VTList.NumVTs == 3;
12070 if (IsTFE) {
12071 unsigned NumValueDWords = divideCeil(Numerator: VT.getSizeInBits(), Denominator: 32);
12072 unsigned NumOpDWords = NumValueDWords + 1;
12073 EVT OpDWordsVT = EVT::getVectorVT(Context&: C, VT: MVT::i32, NumElements: NumOpDWords);
12074 SDVTList OpDWordsVTList = DAG.getVTList(VT1: OpDWordsVT, VT2: VTList.VTs[2]);
12075 MachineMemOperand *OpDWordsMMO =
12076 MF.getMachineMemOperand(MMO, Offset: 0, Size: NumOpDWords * 4);
12077 SDValue Op = getMemIntrinsicNode(Opcode, DL, VTList: OpDWordsVTList, Ops,
12078 MemVT: OpDWordsVT, MMO: OpDWordsMMO, DAG);
12079 SDValue Status = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: Op,
12080 N2: DAG.getVectorIdxConstant(Val: NumValueDWords, DL));
12081 SDValue ZeroIdx = DAG.getVectorIdxConstant(Val: 0, DL);
12082 SDValue ValueDWords =
12083 NumValueDWords == 1
12084 ? DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: Op, N2: ZeroIdx)
12085 : DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL,
12086 VT: EVT::getVectorVT(Context&: C, VT: MVT::i32, NumElements: NumValueDWords), N1: Op,
12087 N2: ZeroIdx);
12088 SDValue Value = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: ValueDWords);
12089 return DAG.getMergeValues(Ops: {Value, Status, SDValue(Op.getNode(), 1)}, dl: DL);
12090 }
12091
12092 if (!Subtarget->hasDwordx3LoadStores() &&
12093 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
12094 EVT WidenedVT = EVT::getVectorVT(Context&: C, VT: VT.getVectorElementType(), NumElements: 4);
12095 EVT WidenedMemVT = EVT::getVectorVT(Context&: C, VT: MemVT.getVectorElementType(), NumElements: 4);
12096 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, Offset: 0, Size: 16);
12097 SDVTList WidenedVTList = DAG.getVTList(VT1: WidenedVT, VT2: VTList.VTs[1]);
12098 SDValue Op = DAG.getMemIntrinsicNode(Opcode, dl: DL, VTList: WidenedVTList, Ops,
12099 MemVT: WidenedMemVT, MMO: WidenedMMO);
12100 SDValue Value = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: Op,
12101 N2: DAG.getVectorIdxConstant(Val: 0, DL));
12102 return DAG.getMergeValues(Ops: {Value, SDValue(Op.getNode(), 1)}, dl: DL);
12103 }
12104
12105 return DAG.getMemIntrinsicNode(Opcode, dl: DL, VTList, Ops, MemVT, MMO);
12106}
12107
12108SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
12109 bool ImageStore) const {
12110 EVT StoreVT = VData.getValueType();
12111
12112 // No change for f16 and legal vector D16 types.
12113 if (!StoreVT.isVector())
12114 return VData;
12115
12116 SDLoc DL(VData);
12117 unsigned NumElements = StoreVT.getVectorNumElements();
12118
12119 if (Subtarget->hasUnpackedD16VMem()) {
12120 // We need to unpack the packed data to store.
12121 EVT IntStoreVT = StoreVT.changeTypeToInteger();
12122 SDValue IntVData = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IntStoreVT, Operand: VData);
12123
12124 EVT EquivStoreVT =
12125 EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements);
12126 SDValue ZExt = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: EquivStoreVT, Operand: IntVData);
12127 return DAG.UnrollVectorOp(N: ZExt.getNode());
12128 }
12129
12130 // The sq block of gfx8.1 does not estimate register use correctly for d16
12131 // image store instructions. The data operand is computed as if it were not a
12132 // d16 image instruction.
12133 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
12134 // Bitcast to i16
12135 EVT IntStoreVT = StoreVT.changeTypeToInteger();
12136 SDValue IntVData = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IntStoreVT, Operand: VData);
12137
12138 // Decompose into scalars
12139 SmallVector<SDValue, 4> Elts;
12140 DAG.ExtractVectorElements(Op: IntVData, Args&: Elts);
12141
12142 // Group pairs of i16 into v2i16 and bitcast to i32
12143 SmallVector<SDValue, 4> PackedElts;
12144 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
12145 SDValue Pair =
12146 DAG.getBuildVector(VT: MVT::v2i16, DL, Ops: {Elts[I * 2], Elts[I * 2 + 1]});
12147 SDValue IntPair = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i32, Operand: Pair);
12148 PackedElts.push_back(Elt: IntPair);
12149 }
12150 if ((NumElements % 2) == 1) {
12151 // Handle v3i16
12152 unsigned I = Elts.size() / 2;
12153 SDValue Pair = DAG.getBuildVector(VT: MVT::v2i16, DL,
12154 Ops: {Elts[I * 2], DAG.getPOISON(VT: MVT::i16)});
12155 SDValue IntPair = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i32, Operand: Pair);
12156 PackedElts.push_back(Elt: IntPair);
12157 }
12158
12159 // Pad using UNDEF
12160 PackedElts.resize(N: Elts.size(), NV: DAG.getPOISON(VT: MVT::i32));
12161
12162 // Build final vector
12163 EVT VecVT =
12164 EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements: PackedElts.size());
12165 return DAG.getBuildVector(VT: VecVT, DL, Ops: PackedElts);
12166 }
12167
12168 if (NumElements == 3) {
12169 EVT IntStoreVT =
12170 EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: StoreVT.getStoreSizeInBits());
12171 SDValue IntVData = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IntStoreVT, Operand: VData);
12172
12173 EVT WidenedStoreVT = EVT::getVectorVT(
12174 Context&: *DAG.getContext(), VT: StoreVT.getVectorElementType(), NumElements: NumElements + 1);
12175 EVT WidenedIntVT = EVT::getIntegerVT(Context&: *DAG.getContext(),
12176 BitWidth: WidenedStoreVT.getStoreSizeInBits());
12177 SDValue ZExt = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: WidenedIntVT, Operand: IntVData);
12178 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: WidenedStoreVT, Operand: ZExt);
12179 }
12180
12181 assert(isTypeLegal(StoreVT));
12182 return VData;
12183}
12184
12185static bool isAsyncLDSDMA(Intrinsic::ID Intr) {
12186 switch (Intr) {
12187 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
12188 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
12189 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
12190 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
12191 case Intrinsic::amdgcn_load_async_to_lds:
12192 case Intrinsic::amdgcn_global_load_async_lds:
12193 return true;
12194 }
12195 return false;
12196}
12197
12198SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
12199 SelectionDAG &DAG) const {
12200 SDLoc DL(Op);
12201 SDValue Chain = Op.getOperand(i: 0);
12202 unsigned IntrinsicID = Op.getConstantOperandVal(i: 1);
12203
12204 switch (IntrinsicID) {
12205 case Intrinsic::amdgcn_exp_compr: {
12206 if (!Subtarget->hasCompressedExport()) {
12207 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
12208 DAG.getMachineFunction().getFunction(),
12209 "intrinsic not supported on subtarget", DL.getDebugLoc()));
12210 }
12211 SDValue Src0 = Op.getOperand(i: 4);
12212 SDValue Src1 = Op.getOperand(i: 5);
12213 // Hack around illegal type on SI by directly selecting it.
12214 if (isTypeLegal(VT: Src0.getValueType()))
12215 return SDValue();
12216
12217 const ConstantSDNode *Done = cast<ConstantSDNode>(Val: Op.getOperand(i: 6));
12218 SDValue Undef = DAG.getPOISON(VT: MVT::f32);
12219 const SDValue Ops[] = {
12220 Op.getOperand(i: 2), // tgt
12221 DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f32, Operand: Src0), // src0
12222 DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f32, Operand: Src1), // src1
12223 Undef, // src2
12224 Undef, // src3
12225 Op.getOperand(i: 7), // vm
12226 DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // compr
12227 Op.getOperand(i: 3), // en
12228 Op.getOperand(i: 0) // Chain
12229 };
12230
12231 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
12232 return SDValue(DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: Op->getVTList(), Ops), 0);
12233 }
12234
12235 case Intrinsic::amdgcn_struct_tbuffer_store:
12236 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
12237 SDValue VData = Op.getOperand(i: 2);
12238 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
12239 if (IsD16)
12240 VData = handleD16VData(VData, DAG);
12241 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG);
12242 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 5), DAG);
12243 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 6), DAG, Subtarget);
12244 SDValue Ops[] = {
12245 Chain,
12246 VData, // vdata
12247 Rsrc, // rsrc
12248 Op.getOperand(i: 4), // vindex
12249 VOffset, // voffset
12250 SOffset, // soffset
12251 Offset, // offset
12252 Op.getOperand(i: 7), // format
12253 Op.getOperand(i: 8), // cachepolicy, swizzled buffer
12254 DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // idxen
12255 };
12256 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
12257 : AMDGPUISD::TBUFFER_STORE_FORMAT;
12258 MemSDNode *M = cast<MemSDNode>(Val&: Op);
12259 return DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: Op->getVTList(), Ops,
12260 MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
12261 }
12262
12263 case Intrinsic::amdgcn_raw_tbuffer_store:
12264 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
12265 SDValue VData = Op.getOperand(i: 2);
12266 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
12267 if (IsD16)
12268 VData = handleD16VData(VData, DAG);
12269 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG);
12270 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG);
12271 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget);
12272 SDValue Ops[] = {
12273 Chain,
12274 VData, // vdata
12275 Rsrc, // rsrc
12276 DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
12277 VOffset, // voffset
12278 SOffset, // soffset
12279 Offset, // offset
12280 Op.getOperand(i: 6), // format
12281 Op.getOperand(i: 7), // cachepolicy, swizzled buffer
12282 DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
12283 };
12284 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
12285 : AMDGPUISD::TBUFFER_STORE_FORMAT;
12286 MemSDNode *M = cast<MemSDNode>(Val&: Op);
12287 return DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: Op->getVTList(), Ops,
12288 MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
12289 }
12290
12291 case Intrinsic::amdgcn_raw_buffer_store:
12292 case Intrinsic::amdgcn_raw_ptr_buffer_store:
12293 case Intrinsic::amdgcn_raw_buffer_store_format:
12294 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
12295 const bool IsFormat =
12296 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
12297 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
12298
12299 SDValue VData = Op.getOperand(i: 2);
12300 EVT VDataVT = VData.getValueType();
12301 EVT EltType = VDataVT.getScalarType();
12302 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
12303 if (IsD16) {
12304 VData = handleD16VData(VData, DAG);
12305 VDataVT = VData.getValueType();
12306 }
12307
12308 if (!isTypeLegal(VT: VDataVT)) {
12309 VData =
12310 DAG.getNode(Opcode: ISD::BITCAST, DL,
12311 VT: getEquivalentMemType(Context&: *DAG.getContext(), VT: VDataVT), Operand: VData);
12312 }
12313
12314 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG);
12315 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG);
12316 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget);
12317 SDValue Ops[] = {
12318 Chain,
12319 VData,
12320 Rsrc,
12321 DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
12322 VOffset, // voffset
12323 SOffset, // soffset
12324 Offset, // offset
12325 Op.getOperand(i: 6), // cachepolicy, swizzled buffer
12326 DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
12327 };
12328 unsigned Opc =
12329 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
12330 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
12331 MemSDNode *M = cast<MemSDNode>(Val&: Op);
12332
12333 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
12334 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
12335 return handleByteShortBufferStores(DAG, VDataType: VDataVT, DL, Ops, M);
12336
12337 return DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: Op->getVTList(), Ops,
12338 MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
12339 }
12340
12341 case Intrinsic::amdgcn_struct_buffer_store:
12342 case Intrinsic::amdgcn_struct_ptr_buffer_store:
12343 case Intrinsic::amdgcn_struct_buffer_store_format:
12344 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
12345 const bool IsFormat =
12346 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
12347 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
12348
12349 SDValue VData = Op.getOperand(i: 2);
12350 EVT VDataVT = VData.getValueType();
12351 EVT EltType = VDataVT.getScalarType();
12352 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
12353
12354 if (IsD16) {
12355 VData = handleD16VData(VData, DAG);
12356 VDataVT = VData.getValueType();
12357 }
12358
12359 if (!isTypeLegal(VT: VDataVT)) {
12360 VData =
12361 DAG.getNode(Opcode: ISD::BITCAST, DL,
12362 VT: getEquivalentMemType(Context&: *DAG.getContext(), VT: VDataVT), Operand: VData);
12363 }
12364
12365 auto Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG);
12366 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 5), DAG);
12367 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 6), DAG, Subtarget);
12368 SDValue Ops[] = {
12369 Chain,
12370 VData,
12371 Rsrc,
12372 Op.getOperand(i: 4), // vindex
12373 VOffset, // voffset
12374 SOffset, // soffset
12375 Offset, // offset
12376 Op.getOperand(i: 7), // cachepolicy, swizzled buffer
12377 DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // idxen
12378 };
12379 unsigned Opc =
12380 !IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
12381 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
12382 MemSDNode *M = cast<MemSDNode>(Val&: Op);
12383
12384 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
12385 EVT VDataType = VData.getValueType().getScalarType();
12386 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
12387 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
12388
12389 return DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: Op->getVTList(), Ops,
12390 MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
12391 }
12392 case Intrinsic::amdgcn_raw_buffer_load_lds:
12393 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
12394 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
12395 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
12396 case Intrinsic::amdgcn_struct_buffer_load_lds:
12397 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
12398 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
12399 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: {
12400 if (!Subtarget->hasVMemToLDSLoad())
12401 return SDValue();
12402 unsigned Opc;
12403 bool HasVIndex =
12404 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
12405 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_async_lds ||
12406 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds ||
12407 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds;
12408 unsigned OpOffset = HasVIndex ? 1 : 0;
12409 SDValue VOffset = Op.getOperand(i: 5 + OpOffset);
12410 bool HasVOffset = !isNullConstant(V: VOffset);
12411 unsigned Size = Op->getConstantOperandVal(Num: 4);
12412
12413 switch (Size) {
12414 default:
12415 return SDValue();
12416 case 1:
12417 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
12418 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
12419 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
12420 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
12421 break;
12422 case 2:
12423 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
12424 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
12425 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
12426 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
12427 break;
12428 case 4:
12429 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
12430 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
12431 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
12432 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
12433 break;
12434 case 12:
12435 if (!Subtarget->hasLDSLoadB96_B128())
12436 return SDValue();
12437 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
12438 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
12439 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
12440 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
12441 break;
12442 case 16:
12443 if (!Subtarget->hasLDSLoadB96_B128())
12444 return SDValue();
12445 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
12446 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
12447 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
12448 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
12449 break;
12450 }
12451
12452 SDValue M0Val = copyToM0(DAG, Chain, DL, V: Op.getOperand(i: 3));
12453
12454 SmallVector<SDValue, 8> Ops;
12455
12456 if (HasVIndex && HasVOffset)
12457 Ops.push_back(Elt: DAG.getBuildVector(VT: MVT::v2i32, DL,
12458 Ops: {Op.getOperand(i: 5), // VIndex
12459 VOffset}));
12460 else if (HasVIndex)
12461 Ops.push_back(Elt: Op.getOperand(i: 5));
12462 else if (HasVOffset)
12463 Ops.push_back(Elt: VOffset);
12464
12465 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG);
12466 Ops.push_back(Elt: Rsrc);
12467 Ops.push_back(Elt: Op.getOperand(i: 6 + OpOffset)); // soffset
12468 Ops.push_back(Elt: Op.getOperand(i: 7 + OpOffset)); // imm offset
12469 bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI: *Subtarget);
12470 unsigned Aux = Op.getConstantOperandVal(i: 8 + OpOffset);
12471 Ops.push_back(Elt: DAG.getTargetConstant(
12472 Val: Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
12473 DL, VT: MVT::i8)); // cpol
12474 Ops.push_back(Elt: DAG.getTargetConstant(
12475 Val: Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
12476 ? 1
12477 : 0,
12478 DL, VT: MVT::i8)); // swz
12479 Ops.push_back(
12480 Elt: DAG.getTargetConstant(Val: isAsyncLDSDMA(Intr: IntrinsicID), DL, VT: MVT::i8));
12481 Ops.push_back(Elt: M0Val.getValue(R: 0)); // Chain
12482 Ops.push_back(Elt: M0Val.getValue(R: 1)); // Glue
12483
12484 auto *M = cast<MemSDNode>(Val&: Op);
12485 auto *Load = DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: M->getVTList(), Ops);
12486 DAG.setNodeMemRefs(N: Load, NewMemRefs: M->memoperands());
12487
12488 return SDValue(Load, 0);
12489 }
12490 // Buffers are handled by LowerBufferFatPointers, and we're going to go
12491 // for "trust me" that the remaining cases are global pointers until
12492 // such time as we can put two mem operands on an intrinsic.
12493 case Intrinsic::amdgcn_load_to_lds:
12494 case Intrinsic::amdgcn_load_async_to_lds:
12495 case Intrinsic::amdgcn_global_load_lds:
12496 case Intrinsic::amdgcn_global_load_async_lds: {
12497 if (!Subtarget->hasVMemToLDSLoad())
12498 return SDValue();
12499
12500 unsigned Opc;
12501 unsigned Size = Op->getConstantOperandVal(Num: 4);
12502 switch (Size) {
12503 default:
12504 return SDValue();
12505 case 1:
12506 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
12507 break;
12508 case 2:
12509 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
12510 break;
12511 case 4:
12512 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
12513 break;
12514 case 12:
12515 if (!Subtarget->hasLDSLoadB96_B128())
12516 return SDValue();
12517 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
12518 break;
12519 case 16:
12520 if (!Subtarget->hasLDSLoadB96_B128())
12521 return SDValue();
12522 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
12523 break;
12524 }
12525
12526 SDValue M0Val = copyToM0(DAG, Chain, DL, V: Op.getOperand(i: 3));
12527
12528 SmallVector<SDValue, 6> Ops;
12529
12530 SDValue Addr = Op.getOperand(i: 2); // Global ptr
12531 SDValue VOffset;
12532 // Try to split SAddr and VOffset. Global and LDS pointers share the same
12533 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
12534 if (Addr->isDivergent() && Addr->isAnyAdd()) {
12535 SDValue LHS = Addr.getOperand(i: 0);
12536 SDValue RHS = Addr.getOperand(i: 1);
12537
12538 if (LHS->isDivergent())
12539 std::swap(a&: LHS, b&: RHS);
12540
12541 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
12542 RHS.getOperand(i: 0).getValueType() == MVT::i32) {
12543 // add (i64 sgpr), (zero_extend (i32 vgpr))
12544 Addr = LHS;
12545 VOffset = RHS.getOperand(i: 0);
12546 }
12547 }
12548
12549 Ops.push_back(Elt: Addr);
12550 if (!Addr->isDivergent()) {
12551 Opc = AMDGPU::getGlobalSaddrOp(Opcode: Opc);
12552 if (!VOffset)
12553 VOffset =
12554 SDValue(DAG.getMachineNode(Opcode: AMDGPU::V_MOV_B32_e32, dl: DL, VT: MVT::i32,
12555 Op1: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32)),
12556 0);
12557 Ops.push_back(Elt: VOffset);
12558 }
12559
12560 Ops.push_back(Elt: Op.getOperand(i: 5)); // Offset
12561
12562 unsigned Aux = Op.getConstantOperandVal(i: 6);
12563 Ops.push_back(Elt: DAG.getTargetConstant(Val: Aux & ~AMDGPU::CPol::VIRTUAL_BITS, DL,
12564 VT: MVT::i32)); // CPol
12565 Ops.push_back(
12566 Elt: DAG.getTargetConstant(Val: isAsyncLDSDMA(Intr: IntrinsicID), DL, VT: MVT::i8));
12567
12568 Ops.push_back(Elt: M0Val.getValue(R: 0)); // Chain
12569 Ops.push_back(Elt: M0Val.getValue(R: 1)); // Glue
12570
12571 auto *M = cast<MemSDNode>(Val&: Op);
12572 auto *Load = DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: Op->getVTList(), Ops);
12573 DAG.setNodeMemRefs(N: Load, NewMemRefs: M->memoperands());
12574
12575 return SDValue(Load, 0);
12576 }
12577 case Intrinsic::amdgcn_end_cf:
12578 return SDValue(DAG.getMachineNode(Opcode: AMDGPU::SI_END_CF, dl: DL, VT: MVT::Other,
12579 Op1: Op->getOperand(Num: 2), Op2: Chain),
12580 0);
12581 case Intrinsic::amdgcn_s_barrier_signal_var: {
12582 // Member count of 0 means to re-use a previous member count,
12583 // which, if the named barrier is statically chosen, means we can use
12584 // the immarg form. Otherwisee, fall through to constructiong M0 as for
12585 // s_barrier_init.
12586 SDValue CntOp = Op->getOperand(Num: 3);
12587 auto *CntC = dyn_cast<ConstantSDNode>(Val&: CntOp);
12588 if (CntC && CntC->isZero()) {
12589 SDValue Chain = Op->getOperand(Num: 0);
12590 SDValue BarOp = Op->getOperand(Num: 2);
12591 SmallVector<SDValue, 2> Ops;
12592
12593 std::optional<uint64_t> BarVal;
12594 if (auto *C = dyn_cast<ConstantSDNode>(Val&: BarOp))
12595 BarVal = C->getZExtValue();
12596 else if (auto *GA = dyn_cast<GlobalAddressSDNode>(Val&: BarOp))
12597 if (auto Addr = AMDGPUMachineFunctionInfo::getLDSAbsoluteAddress(
12598 GV: *GA->getGlobal()))
12599 BarVal = *Addr + GA->getOffset();
12600
12601 if (BarVal) {
12602 unsigned BarID = (*BarVal >> 4) & 0x3F;
12603 Ops.push_back(Elt: DAG.getTargetConstant(Val: BarID, DL, VT: MVT::i32));
12604 Ops.push_back(Elt: Chain);
12605 auto *NewMI = DAG.getMachineNode(Opcode: AMDGPU::S_BARRIER_SIGNAL_IMM, dl: DL,
12606 VTs: Op->getVTList(), Ops);
12607 return SDValue(NewMI, 0);
12608 }
12609 }
12610 [[fallthrough]];
12611 }
12612 case Intrinsic::amdgcn_s_barrier_init: {
12613 // these two intrinsics have two operands: barrier pointer and member count
12614 SDValue Chain = Op->getOperand(Num: 0);
12615 SmallVector<SDValue, 2> Ops;
12616 SDValue BarOp = Op->getOperand(Num: 2);
12617 SDValue CntOp = Op->getOperand(Num: 3);
12618 SDValue M0Val;
12619 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
12620 ? AMDGPU::S_BARRIER_INIT_M0
12621 : AMDGPU::S_BARRIER_SIGNAL_M0;
12622 // extract the BarrierID from bits 4-9 of BarOp
12623 SDValue BarID;
12624 BarID = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: BarOp,
12625 N2: DAG.getShiftAmountConstant(Val: 4, VT: MVT::i32, DL));
12626 BarID =
12627 SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_AND_B32, dl: DL, VT: MVT::i32, Op1: BarID,
12628 Op2: DAG.getTargetConstant(Val: 0x3F, DL, VT: MVT::i32)),
12629 0);
12630 // Member count should be put into M0[ShAmt:+6]
12631 // Barrier ID should be put into M0[5:0]
12632 M0Val =
12633 SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_AND_B32, dl: DL, VT: MVT::i32, Op1: CntOp,
12634 Op2: DAG.getTargetConstant(Val: 0x3F, DL, VT: MVT::i32)),
12635 0);
12636 constexpr unsigned ShAmt = 16;
12637 M0Val = DAG.getNode(Opcode: ISD::SHL, DL, VT: MVT::i32, N1: CntOp,
12638 N2: DAG.getShiftAmountConstant(Val: ShAmt, VT: MVT::i32, DL));
12639
12640 M0Val = SDValue(
12641 DAG.getMachineNode(Opcode: AMDGPU::S_OR_B32, dl: DL, VT: MVT::i32, Op1: M0Val, Op2: BarID), 0);
12642
12643 Ops.push_back(Elt: copyToM0(DAG, Chain, DL, V: M0Val).getValue(R: 0));
12644
12645 auto *NewMI = DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: Op->getVTList(), Ops);
12646 return SDValue(NewMI, 0);
12647 }
12648 case Intrinsic::amdgcn_s_wakeup_barrier: {
12649 if (!Subtarget->hasSWakeupBarrier())
12650 return SDValue();
12651 [[fallthrough]];
12652 }
12653 case Intrinsic::amdgcn_s_barrier_join: {
12654 // these three intrinsics have one operand: barrier pointer
12655 SDValue Chain = Op->getOperand(Num: 0);
12656 SmallVector<SDValue, 2> Ops;
12657 SDValue BarOp = Op->getOperand(Num: 2);
12658 unsigned Opc;
12659
12660 if (isa<ConstantSDNode>(Val: BarOp)) {
12661 uint64_t BarVal = cast<ConstantSDNode>(Val&: BarOp)->getZExtValue();
12662 switch (IntrinsicID) {
12663 default:
12664 return SDValue();
12665 case Intrinsic::amdgcn_s_barrier_join:
12666 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
12667 break;
12668 case Intrinsic::amdgcn_s_wakeup_barrier:
12669 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
12670 break;
12671 }
12672 // extract the BarrierID from bits 4-9 of the immediate
12673 unsigned BarID = (BarVal >> 4) & 0x3F;
12674 SDValue K = DAG.getTargetConstant(Val: BarID, DL, VT: MVT::i32);
12675 Ops.push_back(Elt: K);
12676 Ops.push_back(Elt: Chain);
12677 } else {
12678 switch (IntrinsicID) {
12679 default:
12680 return SDValue();
12681 case Intrinsic::amdgcn_s_barrier_join:
12682 Opc = AMDGPU::S_BARRIER_JOIN_M0;
12683 break;
12684 case Intrinsic::amdgcn_s_wakeup_barrier:
12685 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
12686 break;
12687 }
12688 // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
12689 SDValue M0Val;
12690 M0Val = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: BarOp,
12691 N2: DAG.getShiftAmountConstant(Val: 4, VT: MVT::i32, DL));
12692 M0Val =
12693 SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_AND_B32, dl: DL, VT: MVT::i32, Op1: M0Val,
12694 Op2: DAG.getTargetConstant(Val: 0x3F, DL, VT: MVT::i32)),
12695 0);
12696 Ops.push_back(Elt: copyToM0(DAG, Chain, DL, V: M0Val).getValue(R: 0));
12697 }
12698
12699 auto *NewMI = DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: Op->getVTList(), Ops);
12700 return SDValue(NewMI, 0);
12701 }
12702 case Intrinsic::amdgcn_s_prefetch_data:
12703 case Intrinsic::amdgcn_s_prefetch_inst: {
12704 // For non-global address space preserve the chain and remove the call.
12705 if (!AMDGPU::isFlatGlobalAddrSpace(AS: cast<MemSDNode>(Val&: Op)->getAddressSpace()))
12706 return Op.getOperand(i: 0);
12707 return Op;
12708 }
12709 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
12710 SDValue Ops[] = {
12711 Chain, bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG),
12712 Op.getOperand(i: 3), // offset
12713 Op.getOperand(i: 4), // length
12714 };
12715
12716 MemSDNode *M = cast<MemSDNode>(Val&: Op);
12717 return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::SBUFFER_PREFETCH_DATA, dl: DL,
12718 VTList: Op->getVTList(), Ops, MemVT: M->getMemoryVT(),
12719 MMO: M->getMemOperand());
12720 }
12721 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
12722 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
12723 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
12724 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Val&: Op);
12725 SDValue Chain = Op->getOperand(Num: 0);
12726 SDValue Ptr = Op->getOperand(Num: 2);
12727 SDValue Val = Op->getOperand(Num: 3);
12728 return DAG.getAtomic(Opcode: ISD::ATOMIC_STORE, dl: DL, MemVT: MII->getMemoryVT(), Chain, Ptr: Val,
12729 Val: Ptr, MMO: MII->getMemOperand());
12730 }
12731 case Intrinsic::amdgcn_av_store_b128: {
12732 if (!Subtarget->hasFlatGlobalInsts()) {
12733 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
12734 DAG.getMachineFunction().getFunction(),
12735 "llvm.amdgcn.av.store.b128 not supported on subtarget",
12736 DL.getDebugLoc()));
12737 return Op->getOperand(Num: 0); // return the input chain
12738 }
12739 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Val&: Op);
12740 SDValue Chain = Op->getOperand(Num: 0);
12741 SDValue Ptr = Op->getOperand(Num: 2);
12742 SDValue Val = Op->getOperand(Num: 3);
12743 return DAG.getStore(Chain, dl: DL, Val, Ptr, MMO: MII->getMemOperand());
12744 }
12745 default: {
12746 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
12747 AMDGPU::getImageDimIntrinsicInfo(Intr: IntrinsicID))
12748 return lowerImage(Op, Intr: ImageDimIntr, DAG, WithChain: true);
12749
12750 return Op;
12751 }
12752 }
12753}
12754
12755// Return whether the operation has NoUnsignedWrap property.
12756static bool isNoUnsignedWrap(SDValue Addr) {
12757 return (Addr.getOpcode() == ISD::ADD &&
12758 Addr->getFlags().hasNoUnsignedWrap()) ||
12759 Addr->getOpcode() == ISD::OR;
12760}
12761
12762bool SITargetLowering::shouldPreservePtrArith(const Function &F,
12763 EVT PtrVT) const {
12764 return PtrVT == MVT::i64;
12765}
12766
12767bool SITargetLowering::canTransformPtrArithOutOfBounds(const Function &F,
12768 EVT PtrVT) const {
12769 return true;
12770}
12771
12772// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
12773// offset (the offset that is included in bounds checking and swizzling, to be
12774// split between the instruction's voffset and immoffset fields) and soffset
12775// (the offset that is excluded from bounds checking and swizzling, to go in
12776// the instruction's soffset field). This function takes the first kind of
12777// offset and figures out how to split it between voffset and immoffset.
12778std::pair<SDValue, SDValue>
12779SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
12780 SDLoc DL(Offset);
12781 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST: *Subtarget);
12782 SDValue N0 = Offset;
12783 ConstantSDNode *C1 = nullptr;
12784
12785 if ((C1 = dyn_cast<ConstantSDNode>(Val&: N0)))
12786 N0 = SDValue();
12787 else if (DAG.isBaseWithConstantOffset(Op: N0)) {
12788 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
12789 // being added, so we can only safely match a 32-bit addition with no
12790 // unsigned overflow.
12791 bool CheckNUW = Subtarget->hasGFX1250Insts();
12792 if (!CheckNUW || isNoUnsignedWrap(Addr: N0)) {
12793 C1 = cast<ConstantSDNode>(Val: N0.getOperand(i: 1));
12794 N0 = N0.getOperand(i: 0);
12795 }
12796 }
12797
12798 if (C1) {
12799 unsigned ImmOffset = C1->getZExtValue();
12800 // If the immediate value is too big for the immoffset field, put only bits
12801 // that would normally fit in the immoffset field. The remaining value that
12802 // is copied/added for the voffset field is a large power of 2, and it
12803 // stands more chance of being CSEd with the copy/add for another similar
12804 // load/store.
12805 // However, do not do that rounding down if that is a negative
12806 // number, as it appears to be illegal to have a negative offset in the
12807 // vgpr, even if adding the immediate offset makes it positive.
12808 unsigned Overflow = ImmOffset & ~MaxImm;
12809 ImmOffset -= Overflow;
12810 if ((int32_t)Overflow < 0) {
12811 Overflow += ImmOffset;
12812 ImmOffset = 0;
12813 }
12814 C1 = cast<ConstantSDNode>(Val: DAG.getTargetConstant(Val: ImmOffset, DL, VT: MVT::i32));
12815 if (Overflow) {
12816 auto OverflowVal = DAG.getConstant(Val: Overflow, DL, VT: MVT::i32);
12817 if (!N0)
12818 N0 = OverflowVal;
12819 else {
12820 SDValue Ops[] = {N0, OverflowVal};
12821 N0 = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, Ops);
12822 }
12823 }
12824 }
12825 if (!N0)
12826 N0 = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
12827 if (!C1)
12828 C1 = cast<ConstantSDNode>(Val: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32));
12829 return {N0, SDValue(C1, 0)};
12830}
12831
12832// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
12833// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
12834// pointed to by Offsets.
12835void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
12836 SelectionDAG &DAG, SDValue *Offsets,
12837 Align Alignment) const {
12838 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12839 SDLoc DL(CombinedOffset);
12840 if (auto *C = dyn_cast<ConstantSDNode>(Val&: CombinedOffset)) {
12841 uint32_t Imm = C->getZExtValue();
12842 uint32_t SOffset, ImmOffset;
12843 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
12844 Offsets[0] = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
12845 Offsets[1] = DAG.getConstant(Val: SOffset, DL, VT: MVT::i32);
12846 Offsets[2] = DAG.getTargetConstant(Val: ImmOffset, DL, VT: MVT::i32);
12847 return;
12848 }
12849 }
12850 if (DAG.isBaseWithConstantOffset(Op: CombinedOffset)) {
12851 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
12852 // being added, so we can only safely match a 32-bit addition with no
12853 // unsigned overflow.
12854 bool CheckNUW = Subtarget->hasGFX1250Insts();
12855 SDValue N0 = CombinedOffset.getOperand(i: 0);
12856 SDValue N1 = CombinedOffset.getOperand(i: 1);
12857 uint32_t SOffset, ImmOffset;
12858 int Offset = cast<ConstantSDNode>(Val&: N1)->getSExtValue();
12859 if (Offset >= 0 && (!CheckNUW || isNoUnsignedWrap(Addr: CombinedOffset)) &&
12860 TII->splitMUBUFOffset(Imm: Offset, SOffset, ImmOffset, Alignment)) {
12861 Offsets[0] = N0;
12862 Offsets[1] = DAG.getConstant(Val: SOffset, DL, VT: MVT::i32);
12863 Offsets[2] = DAG.getTargetConstant(Val: ImmOffset, DL, VT: MVT::i32);
12864 return;
12865 }
12866 }
12867
12868 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
12869 ? DAG.getRegister(Reg: AMDGPU::SGPR_NULL, VT: MVT::i32)
12870 : DAG.getConstant(Val: 0, DL, VT: MVT::i32);
12871
12872 Offsets[0] = CombinedOffset;
12873 Offsets[1] = SOffsetZero;
12874 Offsets[2] = DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32);
12875}
12876
12877SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
12878 SelectionDAG &DAG) const {
12879 if (!MaybePointer.getValueType().isScalarInteger())
12880 return MaybePointer;
12881
12882 SDValue Rsrc = DAG.getBitcast(VT: MVT::v4i32, V: MaybePointer);
12883 return Rsrc;
12884}
12885
12886// Wrap a global or flat pointer into a buffer intrinsic using the flags
12887// specified in the intrinsic.
12888SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
12889 SelectionDAG &DAG) const {
12890 SDLoc Loc(Op);
12891
12892 SDValue Pointer = Op->getOperand(Num: 1);
12893 SDValue Stride = Op->getOperand(Num: 2);
12894 SDValue NumRecords = Op->getOperand(Num: 3);
12895 SDValue Flags = Op->getOperand(Num: 4);
12896
12897 SDValue ExtStride = DAG.getAnyExtOrTrunc(Op: Stride, DL: Loc, VT: MVT::i32);
12898 SDValue Rsrc;
12899
12900 if (Subtarget->has45BitNumRecordsBufferResource()) {
12901 SDValue Zero = DAG.getConstant(Val: 0, DL: Loc, VT: MVT::i32);
12902 // Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
12903 // num_records.
12904 SDValue ExtPointer = DAG.getAnyExtOrTrunc(Op: Pointer, DL: Loc, VT: MVT::i64);
12905 SDValue NumRecordsLHS =
12906 DAG.getNode(Opcode: ISD::SHL, DL: Loc, VT: MVT::i64, N1: NumRecords,
12907 N2: DAG.getShiftAmountConstant(Val: 57, VT: MVT::i32, DL: Loc));
12908 SDValue LowHalf =
12909 DAG.getNode(Opcode: ISD::OR, DL: Loc, VT: MVT::i64, N1: ExtPointer, N2: NumRecordsLHS);
12910
12911 // Build the higher 64-bit value, which has the higher 38-bit num_records,
12912 // 6-bit zero (omit), 16-bit stride and scale and 4-bit flag.
12913 SDValue NumRecordsRHS =
12914 DAG.getNode(Opcode: ISD::SRL, DL: Loc, VT: MVT::i64, N1: NumRecords,
12915 N2: DAG.getShiftAmountConstant(Val: 7, VT: MVT::i32, DL: Loc));
12916 SDValue ShiftedStride =
12917 DAG.getNode(Opcode: ISD::SHL, DL: Loc, VT: MVT::i32, N1: ExtStride,
12918 N2: DAG.getShiftAmountConstant(Val: 12, VT: MVT::i32, DL: Loc));
12919 SDValue ExtShiftedStrideVec =
12920 DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: Loc, VT: MVT::v2i32, N1: Zero, N2: ShiftedStride);
12921 SDValue ExtShiftedStride =
12922 DAG.getNode(Opcode: ISD::BITCAST, DL: Loc, VT: MVT::i64, Operand: ExtShiftedStrideVec);
12923 SDValue ShiftedFlags =
12924 DAG.getNode(Opcode: ISD::SHL, DL: Loc, VT: MVT::i32, N1: Flags,
12925 N2: DAG.getShiftAmountConstant(Val: 28, VT: MVT::i32, DL: Loc));
12926 SDValue ExtShiftedFlagsVec =
12927 DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: Loc, VT: MVT::v2i32, N1: Zero, N2: ShiftedFlags);
12928 SDValue ExtShiftedFlags =
12929 DAG.getNode(Opcode: ISD::BITCAST, DL: Loc, VT: MVT::i64, Operand: ExtShiftedFlagsVec);
12930 SDValue CombinedFields =
12931 DAG.getNode(Opcode: ISD::OR, DL: Loc, VT: MVT::i64, N1: NumRecordsRHS, N2: ExtShiftedStride);
12932 SDValue HighHalf =
12933 DAG.getNode(Opcode: ISD::OR, DL: Loc, VT: MVT::i64, N1: CombinedFields, N2: ExtShiftedFlags);
12934
12935 Rsrc = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: Loc, VT: MVT::v2i64, N1: LowHalf, N2: HighHalf);
12936 } else {
12937 NumRecords = DAG.getAnyExtOrTrunc(Op: NumRecords, DL: Loc, VT: MVT::i32);
12938 auto [LowHalf, HighHalf] =
12939 DAG.SplitScalar(N: Pointer, DL: Loc, LoVT: MVT::i32, HiVT: MVT::i32);
12940 SDValue Mask = DAG.getConstant(Val: 0x0000ffff, DL: Loc, VT: MVT::i32);
12941 SDValue Masked = DAG.getNode(Opcode: ISD::AND, DL: Loc, VT: MVT::i32, N1: HighHalf, N2: Mask);
12942 SDValue ShiftedStride =
12943 DAG.getNode(Opcode: ISD::SHL, DL: Loc, VT: MVT::i32, N1: ExtStride,
12944 N2: DAG.getShiftAmountConstant(Val: 16, VT: MVT::i32, DL: Loc));
12945 SDValue NewHighHalf =
12946 DAG.getNode(Opcode: ISD::OR, DL: Loc, VT: MVT::i32, N1: Masked, N2: ShiftedStride);
12947
12948 Rsrc = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: Loc, VT: MVT::v4i32, N1: LowHalf, N2: NewHighHalf,
12949 N3: NumRecords, N4: Flags);
12950 }
12951
12952 SDValue RsrcPtr = DAG.getNode(Opcode: ISD::BITCAST, DL: Loc, VT: MVT::i128, Operand: Rsrc);
12953 return RsrcPtr;
12954}
12955
12956// Handle 8 bit and 16 bit buffer loads
12957SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
12958 EVT LoadVT, SDLoc DL,
12959 ArrayRef<SDValue> Ops,
12960 MachineMemOperand *MMO,
12961 bool IsTFE) const {
12962 EVT IntVT = LoadVT.changeTypeToInteger();
12963
12964 if (IsTFE) {
12965 unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
12966 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
12967 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
12968 MachineFunction &MF = DAG.getMachineFunction();
12969 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, Offset: 0, Size: 8);
12970 SDVTList VTs = DAG.getVTList(VT1: MVT::v2i32, VT2: MVT::Other);
12971 SDValue Op = getMemIntrinsicNode(Opcode: Opc, DL, VTList: VTs, Ops, MemVT: MVT::v2i32, MMO: OpMMO, DAG);
12972 SDValue Status = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: Op,
12973 N2: DAG.getConstant(Val: 1, DL, VT: MVT::i32));
12974 SDValue Data = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: Op,
12975 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i32));
12976 SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: IntVT, Operand: Data);
12977 SDValue Value = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: LoadVT, Operand: Trunc);
12978 return DAG.getMergeValues(Ops: {Value, Status, SDValue(Op.getNode(), 1)}, dl: DL);
12979 }
12980
12981 unsigned Opc = LoadVT.getScalarType() == MVT::i8
12982 ? AMDGPUISD::BUFFER_LOAD_UBYTE
12983 : AMDGPUISD::BUFFER_LOAD_USHORT;
12984
12985 SDVTList ResList = DAG.getVTList(VT1: MVT::i32, VT2: MVT::Other);
12986 SDValue BufferLoad =
12987 DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: ResList, Ops, MemVT: IntVT, MMO);
12988 SDValue LoadVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: IntVT, Operand: BufferLoad);
12989 LoadVal = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: LoadVT, Operand: LoadVal);
12990
12991 return DAG.getMergeValues(Ops: {LoadVal, BufferLoad.getValue(R: 1)}, dl: DL);
12992}
12993
12994// Handle 8 bit and 16 bit buffer stores
12995SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
12996 EVT VDataType, SDLoc DL,
12997 SDValue Ops[],
12998 MemSDNode *M) const {
12999 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
13000 Ops[1] = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i16, Operand: Ops[1]);
13001
13002 SDValue BufferStoreExt = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::i32, Operand: Ops[1]);
13003 Ops[1] = BufferStoreExt;
13004 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
13005 : AMDGPUISD::BUFFER_STORE_SHORT;
13006 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
13007 return DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: M->getVTList(), Ops: OpsRef, MemVT: VDataType,
13008 MMO: M->getMemOperand());
13009}
13010
13011static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType,
13012 SDValue Op, const SDLoc &SL, EVT VT) {
13013 if (VT.bitsLT(VT: Op.getValueType()))
13014 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: Op);
13015
13016 switch (ExtType) {
13017 case ISD::SEXTLOAD:
13018 return DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL: SL, VT, Operand: Op);
13019 case ISD::ZEXTLOAD:
13020 return DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT, Operand: Op);
13021 case ISD::EXTLOAD:
13022 return DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT, Operand: Op);
13023 case ISD::NON_EXTLOAD:
13024 return Op;
13025 }
13026
13027 llvm_unreachable("invalid ext type");
13028}
13029
13030// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
13031// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
13032SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
13033 DAGCombinerInfo &DCI) const {
13034 SelectionDAG &DAG = DCI.DAG;
13035 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
13036 return SDValue();
13037
13038 // FIXME: Constant loads should all be marked invariant.
13039 unsigned AS = Ld->getAddressSpace();
13040 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
13041 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
13042 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
13043 return SDValue();
13044
13045 // Don't do this early, since it may interfere with adjacent load merging for
13046 // illegal types. We can avoid losing alignment information for exotic types
13047 // pre-legalize.
13048 EVT MemVT = Ld->getMemoryVT();
13049 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
13050 MemVT.getSizeInBits() >= 32)
13051 return SDValue();
13052
13053 SDLoc SL(Ld);
13054
13055 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
13056 "unexpected vector extload");
13057
13058 // TODO: Drop only high part of range.
13059 SDValue Ptr = Ld->getBasePtr();
13060 SDValue NewLoad = DAG.getLoad(
13061 AM: ISD::UNINDEXED, ExtType: ISD::NON_EXTLOAD, VT: MVT::i32, dl: SL, Chain: Ld->getChain(), Ptr,
13062 Offset: Ld->getOffset(), PtrInfo: Ld->getPointerInfo(), MemVT: MVT::i32, Alignment: Ld->getAlign(),
13063 MMOFlags: Ld->getMemOperand()->getFlags(), AAInfo: Ld->getAAInfo(),
13064 Ranges: nullptr); // Drop ranges
13065
13066 EVT TruncVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: MemVT.getSizeInBits());
13067 if (MemVT.isFloatingPoint()) {
13068 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
13069 "unexpected fp extload");
13070 TruncVT = MemVT.changeTypeToInteger();
13071 }
13072
13073 SDValue Cvt = NewLoad;
13074 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
13075 Cvt = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL: SL, VT: MVT::i32, N1: NewLoad,
13076 N2: DAG.getValueType(TruncVT));
13077 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
13078 Ld->getExtensionType() == ISD::NON_EXTLOAD) {
13079 Cvt = DAG.getZeroExtendInReg(Op: NewLoad, DL: SL, VT: TruncVT);
13080 } else {
13081 assert(Ld->getExtensionType() == ISD::EXTLOAD);
13082 }
13083
13084 EVT VT = Ld->getValueType(ResNo: 0);
13085 EVT IntVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: VT.getSizeInBits());
13086
13087 DCI.AddToWorklist(N: Cvt.getNode());
13088
13089 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
13090 // the appropriate extension from the 32-bit load.
13091 Cvt = getLoadExtOrTrunc(DAG, ExtType: Ld->getExtensionType(), Op: Cvt, SL, VT: IntVT);
13092 DCI.AddToWorklist(N: Cvt.getNode());
13093
13094 // Handle conversion back to floating point if necessary.
13095 Cvt = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Cvt);
13096
13097 return DAG.getMergeValues(Ops: {Cvt, NewLoad.getValue(R: 1)}, dl: SL);
13098}
13099
13100static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
13101 const SIMachineFunctionInfo &Info) {
13102 // TODO: Should check if the address can definitely not access stack.
13103 if (Info.isEntryFunction())
13104 return Info.getUserSGPRInfo().hasFlatScratchInit();
13105 return true;
13106}
13107
13108SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
13109 SDLoc DL(Op);
13110 LoadSDNode *Load = cast<LoadSDNode>(Val&: Op);
13111 ISD::LoadExtType ExtType = Load->getExtensionType();
13112 EVT MemVT = Load->getMemoryVT();
13113 MachineMemOperand *MMO = Load->getMemOperand();
13114
13115 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
13116 if (MemVT == MVT::i16 && isTypeLegal(VT: MVT::i16))
13117 return SDValue();
13118
13119 // FIXME: Copied from PPC
13120 // First, load into 32 bits, then truncate to 1 bit.
13121
13122 SDValue Chain = Load->getChain();
13123 SDValue BasePtr = Load->getBasePtr();
13124
13125 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
13126
13127 SDValue NewLD = DAG.getExtLoad(ExtType: ISD::EXTLOAD, dl: DL, VT: MVT::i32, Chain, Ptr: BasePtr,
13128 MemVT: RealMemVT, MMO);
13129
13130 if (!MemVT.isVector()) {
13131 SDValue Ops[] = {DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MemVT, Operand: NewLD),
13132 NewLD.getValue(R: 1)};
13133
13134 return DAG.getMergeValues(Ops, dl: DL);
13135 }
13136
13137 SmallVector<SDValue, 3> Elts;
13138 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
13139 SDValue Elt = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: NewLD,
13140 N2: DAG.getConstant(Val: I, DL, VT: MVT::i32));
13141
13142 Elts.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i1, Operand: Elt));
13143 }
13144
13145 SDValue Ops[] = {DAG.getBuildVector(VT: MemVT, DL, Ops: Elts), NewLD.getValue(R: 1)};
13146
13147 return DAG.getMergeValues(Ops, dl: DL);
13148 }
13149
13150 if (!MemVT.isVector())
13151 return SDValue();
13152
13153 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
13154 "Custom lowering for non-i32 vectors hasn't been implemented.");
13155
13156 Align Alignment = Load->getAlign();
13157 unsigned AS = Load->getAddressSpace();
13158 if (Subtarget->hasLDSMisalignedBugInWGPMode() &&
13159 AS == AMDGPUAS::FLAT_ADDRESS &&
13160 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
13161 return SplitVectorLoad(Op, DAG);
13162 }
13163
13164 MachineFunction &MF = DAG.getMachineFunction();
13165 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
13166 // If there is a possibility that flat instruction access scratch memory
13167 // then we need to use the same legalization rules we use for private.
13168 if (AS == AMDGPUAS::FLAT_ADDRESS &&
13169 !Subtarget->hasMultiDwordFlatScratchAddressing())
13170 AS = addressMayBeAccessedAsPrivate(MMO: Load->getMemOperand(), Info: *MFI)
13171 ? AMDGPUAS::PRIVATE_ADDRESS
13172 : AMDGPUAS::GLOBAL_ADDRESS;
13173
13174 unsigned NumElements = MemVT.getVectorNumElements();
13175
13176 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
13177 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
13178 (AS == AMDGPUAS::GLOBAL_ADDRESS &&
13179 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
13180 (Load->isInvariant() || isMemOpHasNoClobberedMemOperand(N: Load)))) {
13181 if ((!Op->isDivergent() || AMDGPU::isUniformMMO(MMO)) &&
13182 Alignment >= Align(4) && NumElements < 32) {
13183 if (MemVT.isPow2VectorType() ||
13184 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
13185 return SDValue();
13186 return WidenOrSplitVectorLoad(Op, DAG);
13187 }
13188 // Non-uniform loads will be selected to MUBUF instructions, so they
13189 // have the same legalization requirements as global and private
13190 // loads.
13191 //
13192 }
13193 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
13194 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
13195 AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
13196 if (NumElements > 4)
13197 return SplitVectorLoad(Op, DAG);
13198 // v3 loads not supported on SI.
13199 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
13200 return WidenOrSplitVectorLoad(Op, DAG);
13201
13202 // v3 and v4 loads are supported for private and global memory.
13203 return SDValue();
13204 }
13205 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
13206 // Depending on the setting of the private_element_size field in the
13207 // resource descriptor, we can only make private accesses up to a certain
13208 // size.
13209 switch (Subtarget->getMaxPrivateElementSize()) {
13210 case 4: {
13211 auto [Op0, Op1] = scalarizeVectorLoad(LD: Load, DAG);
13212 return DAG.getMergeValues(Ops: {Op0, Op1}, dl: DL);
13213 }
13214 case 8:
13215 if (NumElements > 2)
13216 return SplitVectorLoad(Op, DAG);
13217 return SDValue();
13218 case 16:
13219 // Same as global/flat
13220 if (NumElements > 4)
13221 return SplitVectorLoad(Op, DAG);
13222 // v3 loads not supported on SI.
13223 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
13224 return WidenOrSplitVectorLoad(Op, DAG);
13225
13226 return SDValue();
13227 default:
13228 llvm_unreachable("unsupported private_element_size");
13229 }
13230 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
13231 unsigned Fast = 0;
13232 auto Flags = Load->getMemOperand()->getFlags();
13233 if (allowsMisalignedMemoryAccessesImpl(Size: MemVT.getSizeInBits(), AddrSpace: AS,
13234 Alignment: Load->getAlign(), Flags, IsFast: &Fast) &&
13235 Fast > 1)
13236 return SDValue();
13237
13238 if (MemVT.isVector())
13239 return SplitVectorLoad(Op, DAG);
13240 }
13241
13242 if (!allowsMemoryAccessForAlignment(Context&: *DAG.getContext(), DL: DAG.getDataLayout(),
13243 VT: MemVT, MMO: *Load->getMemOperand())) {
13244 auto [Op0, Op1] = expandUnalignedLoad(LD: Load, DAG);
13245 return DAG.getMergeValues(Ops: {Op0, Op1}, dl: DL);
13246 }
13247
13248 return SDValue();
13249}
13250
13251SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
13252 EVT VT = Op.getValueType();
13253 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
13254 VT.getSizeInBits() == 512)
13255 return splitTernaryVectorOp(Op, DAG);
13256
13257 assert(VT.getSizeInBits() == 64);
13258
13259 SDLoc DL(Op);
13260 SDValue Cond = DAG.getFreeze(V: Op.getOperand(i: 0));
13261
13262 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
13263 SDValue One = DAG.getConstant(Val: 1, DL, VT: MVT::i32);
13264
13265 SDValue LHS = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v2i32, Operand: Op.getOperand(i: 1));
13266 SDValue RHS = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v2i32, Operand: Op.getOperand(i: 2));
13267
13268 SDValue Lo0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: LHS, N2: Zero);
13269 SDValue Lo1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: RHS, N2: Zero);
13270
13271 SDValue Lo = DAG.getSelect(DL, VT: MVT::i32, Cond, LHS: Lo0, RHS: Lo1);
13272
13273 SDValue Hi0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: LHS, N2: One);
13274 SDValue Hi1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: RHS, N2: One);
13275
13276 SDValue Hi = DAG.getSelect(DL, VT: MVT::i32, Cond, LHS: Hi0, RHS: Hi1);
13277
13278 SDValue Res = DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Lo, Hi});
13279 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Res);
13280}
13281
13282// Catch division cases where we can use shortcuts with rcp and rsq
13283// instructions.
13284SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
13285 SelectionDAG &DAG) const {
13286 SDLoc SL(Op);
13287 SDValue LHS = Op.getOperand(i: 0);
13288 SDValue RHS = Op.getOperand(i: 1);
13289 EVT VT = Op.getValueType();
13290 const SDNodeFlags Flags = Op->getFlags();
13291
13292 bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
13293
13294 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(Val&: LHS)) {
13295 // Without !fpmath accuracy information, we can't do more because we don't
13296 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
13297 // f16 is always accurate enough
13298 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
13299 return SDValue();
13300
13301 if (CLHS->isOne()) {
13302 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
13303 // the CI documentation has a worst case error of 1 ulp.
13304 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
13305 // use it as long as we aren't trying to use denormals.
13306 //
13307 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
13308
13309 // 1.0 / sqrt(x) -> rsq(x)
13310
13311 // XXX - Is afn sufficient to do this for f64? The maximum ULP
13312 // error seems really high at 2^29 ULP.
13313 // 1.0 / x -> rcp(x)
13314 return DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: RHS);
13315 }
13316
13317 // Same as for 1.0, but expand the sign out of the constant.
13318 if (CLHS->isMinusOne()) {
13319 // -1.0 / x -> rcp (fneg x)
13320 SDValue FNegRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
13321 return DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: FNegRHS);
13322 }
13323 }
13324
13325 // For f16 and bf16 require afn or arcp.
13326 // For f32 require afn.
13327 if (!AllowInaccurateRcp &&
13328 ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
13329 return SDValue();
13330
13331 // Turn into multiply by the reciprocal.
13332 // x / y -> x * (1.0 / y)
13333 SDValue Recip = DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: RHS);
13334 return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: LHS, N2: Recip, Flags);
13335}
13336
13337SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
13338 SelectionDAG &DAG) const {
13339 SDLoc SL(Op);
13340 SDValue X = Op.getOperand(i: 0);
13341 SDValue Y = Op.getOperand(i: 1);
13342 EVT VT = Op.getValueType();
13343 const SDNodeFlags Flags = Op->getFlags();
13344
13345 bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
13346 if (!AllowInaccurateDiv)
13347 return SDValue();
13348
13349 const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(Val&: X);
13350 bool IsNegRcp = CLHS && CLHS->isMinusOne();
13351
13352 // Pull out the negation so it folds for free into the source modifiers.
13353 if (IsNegRcp)
13354 X = DAG.getConstantFP(Val: 1.0, DL: SL, VT);
13355
13356 SDValue NegY = IsNegRcp ? Y : DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Y);
13357 SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT);
13358
13359 SDValue R = DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: Y);
13360 if (IsNegRcp)
13361 R = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: R);
13362
13363 SDValue Tmp0 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: NegY, N2: R, N3: One);
13364
13365 R = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: Tmp0, N2: R, N3: R);
13366 SDValue Tmp1 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: NegY, N2: R, N3: One);
13367 R = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: Tmp1, N2: R, N3: R);
13368
13369 // Skip the last 2 correction terms for reciprocal.
13370 if (IsNegRcp || (CLHS && CLHS->isOne()))
13371 return R;
13372
13373 SDValue Ret = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: R);
13374 SDValue Tmp2 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: NegY, N2: Ret, N3: X);
13375 return DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: Tmp2, N2: R, N3: Ret);
13376}
13377
13378static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
13379 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
13380 SDNodeFlags Flags) {
13381 if (GlueChain->getNumValues() <= 1) {
13382 return DAG.getNode(Opcode, DL: SL, VT, N1: A, N2: B, Flags);
13383 }
13384
13385 assert(GlueChain->getNumValues() == 3);
13386
13387 SDVTList VTList = DAG.getVTList(VT1: VT, VT2: MVT::Other, VT3: MVT::Glue);
13388 switch (Opcode) {
13389 default:
13390 llvm_unreachable("no chain equivalent for opcode");
13391 case ISD::FMUL:
13392 Opcode = AMDGPUISD::FMUL_W_CHAIN;
13393 break;
13394 }
13395
13396 return DAG.getNode(Opcode, DL: SL, VTList,
13397 Ops: {GlueChain.getValue(R: 1), A, B, GlueChain.getValue(R: 2)},
13398 Flags);
13399}
13400
13401static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
13402 EVT VT, SDValue A, SDValue B, SDValue C,
13403 SDValue GlueChain, SDNodeFlags Flags) {
13404 if (GlueChain->getNumValues() <= 1) {
13405 return DAG.getNode(Opcode, DL: SL, VT, Ops: {A, B, C}, Flags);
13406 }
13407
13408 assert(GlueChain->getNumValues() == 3);
13409
13410 SDVTList VTList = DAG.getVTList(VT1: VT, VT2: MVT::Other, VT3: MVT::Glue);
13411 switch (Opcode) {
13412 default:
13413 llvm_unreachable("no chain equivalent for opcode");
13414 case ISD::FMA:
13415 Opcode = AMDGPUISD::FMA_W_CHAIN;
13416 break;
13417 }
13418
13419 return DAG.getNode(Opcode, DL: SL, VTList,
13420 Ops: {GlueChain.getValue(R: 1), A, B, C, GlueChain.getValue(R: 2)},
13421 Flags);
13422}
13423
13424SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
13425 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
13426 return FastLowered;
13427
13428 SDLoc SL(Op);
13429 EVT VT = Op.getValueType();
13430 SDValue LHS = Op.getOperand(i: 0);
13431 SDValue RHS = Op.getOperand(i: 1);
13432
13433 SDValue LHSExt = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: LHS);
13434 SDValue RHSExt = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: RHS);
13435
13436 if (VT == MVT::bf16) {
13437 SDValue ExtDiv =
13438 DAG.getNode(Opcode: ISD::FDIV, DL: SL, VT: MVT::f32, N1: LHSExt, N2: RHSExt, Flags: Op->getFlags());
13439 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT: MVT::bf16, N1: ExtDiv,
13440 N2: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32));
13441 }
13442
13443 assert(VT == MVT::f16);
13444
13445 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
13446 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
13447 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
13448 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
13449 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
13450 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
13451 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
13452 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
13453 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
13454 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
13455 // q16.u = opx(V_CVT_F16_F32, q32.u);
13456 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
13457
13458 // We will use ISD::FMA on targets that don't support ISD::FMAD.
13459 unsigned FMADOpCode =
13460 isOperationLegal(Op: ISD::FMAD, VT: MVT::f32) ? ISD::FMAD : ISD::FMA;
13461 SDValue NegRHSExt = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f32, Operand: RHSExt);
13462 SDValue Rcp =
13463 DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT: MVT::f32, Operand: RHSExt, Flags: Op->getFlags());
13464 SDValue Quot =
13465 DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f32, N1: LHSExt, N2: Rcp, Flags: Op->getFlags());
13466 SDValue Err = DAG.getNode(Opcode: FMADOpCode, DL: SL, VT: MVT::f32, N1: NegRHSExt, N2: Quot, N3: LHSExt,
13467 Flags: Op->getFlags());
13468 Quot = DAG.getNode(Opcode: FMADOpCode, DL: SL, VT: MVT::f32, N1: Err, N2: Rcp, N3: Quot, Flags: Op->getFlags());
13469 Err = DAG.getNode(Opcode: FMADOpCode, DL: SL, VT: MVT::f32, N1: NegRHSExt, N2: Quot, N3: LHSExt,
13470 Flags: Op->getFlags());
13471 SDValue Tmp = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f32, N1: Err, N2: Rcp, Flags: Op->getFlags());
13472 SDValue TmpCast = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: Tmp);
13473 TmpCast = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32, N1: TmpCast,
13474 N2: DAG.getConstant(Val: 0xff800000, DL: SL, VT: MVT::i32));
13475 Tmp = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f32, Operand: TmpCast);
13476 Quot = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT: MVT::f32, N1: Tmp, N2: Quot, Flags: Op->getFlags());
13477 SDValue RDst = DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT: MVT::f16, N1: Quot,
13478 N2: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32));
13479 return DAG.getNode(Opcode: AMDGPUISD::DIV_FIXUP, DL: SL, VT: MVT::f16, N1: RDst, N2: RHS, N3: LHS,
13480 Flags: Op->getFlags());
13481}
13482
13483// Faster 2.5 ULP division that does not support denormals.
13484SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
13485 SDNodeFlags Flags = Op->getFlags();
13486 SDLoc SL(Op);
13487 SDValue LHS = Op.getOperand(i: 1);
13488 SDValue RHS = Op.getOperand(i: 2);
13489
13490 // TODO: The combiner should probably handle elimination of redundant fabs.
13491 SDValue r1 = DAG.SignBitIsZeroFP(Op: RHS)
13492 ? RHS
13493 : DAG.getNode(Opcode: ISD::FABS, DL: SL, VT: MVT::f32, Operand: RHS, Flags);
13494
13495 const APFloat K0Val(0x1p+96f);
13496 const SDValue K0 = DAG.getConstantFP(Val: K0Val, DL: SL, VT: MVT::f32);
13497
13498 const APFloat K1Val(0x1p-32f);
13499 const SDValue K1 = DAG.getConstantFP(Val: K1Val, DL: SL, VT: MVT::f32);
13500
13501 const SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT: MVT::f32);
13502
13503 EVT SetCCVT =
13504 getSetCCResultType(DL: DAG.getDataLayout(), Ctx&: *DAG.getContext(), VT: MVT::f32);
13505
13506 SDValue r2 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: r1, RHS: K0, Cond: ISD::SETOGT);
13507
13508 SDValue r3 = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::f32, N1: r2, N2: K1, N3: One, Flags);
13509
13510 r1 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f32, N1: RHS, N2: r3, Flags);
13511
13512 // rcp does not support denormals.
13513 SDValue r0 = DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT: MVT::f32, Operand: r1, Flags);
13514
13515 SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f32, N1: LHS, N2: r0, Flags);
13516
13517 return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f32, N1: r3, N2: Mul, Flags);
13518}
13519
13520// Returns immediate value for setting the F32 denorm mode when using the
13521// S_DENORM_MODE instruction.
13522static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
13523 const SIMachineFunctionInfo *Info,
13524 const GCNSubtarget *ST) {
13525 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
13526 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
13527 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
13528 return DAG.getTargetConstant(Val: Mode, DL: SDLoc(), VT: MVT::i32);
13529}
13530
13531SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
13532 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
13533 return FastLowered;
13534
13535 // The selection matcher assumes anything with a chain selecting to a
13536 // mayRaiseFPException machine instruction. Since we're introducing a chain
13537 // here, we need to explicitly report nofpexcept for the regular fdiv
13538 // lowering.
13539 SDNodeFlags Flags = Op->getFlags();
13540 Flags.setNoFPExcept(true);
13541
13542 SDLoc SL(Op);
13543 SDValue LHS = Op.getOperand(i: 0);
13544 SDValue RHS = Op.getOperand(i: 1);
13545
13546 const SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT: MVT::f32);
13547
13548 SDVTList ScaleVT = DAG.getVTList(VT1: MVT::f32, VT2: MVT::i1);
13549
13550 SDValue DenominatorScaled =
13551 DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL: SL, VTList: ScaleVT, Ops: {RHS, RHS, LHS}, Flags);
13552 SDValue NumeratorScaled =
13553 DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL: SL, VTList: ScaleVT, Ops: {LHS, RHS, LHS}, Flags);
13554
13555 // Denominator is scaled to not be denormal, so using rcp is ok.
13556 SDValue ApproxRcp =
13557 DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT: MVT::f32, Operand: DenominatorScaled, Flags);
13558 SDValue NegDivScale0 =
13559 DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f32, Operand: DenominatorScaled, Flags);
13560
13561 using namespace AMDGPU::Hwreg;
13562 const unsigned Denorm32Reg = HwregEncoding::encode(Values: ID_MODE, Values: 4, Values: 2);
13563 const SDValue BitField = DAG.getTargetConstant(Val: Denorm32Reg, DL: SL, VT: MVT::i32);
13564
13565 const MachineFunction &MF = DAG.getMachineFunction();
13566 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13567 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
13568
13569 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
13570 const bool HasDynamicDenormals =
13571 (DenormMode.Input == DenormalMode::Dynamic) ||
13572 (DenormMode.Output == DenormalMode::Dynamic);
13573
13574 SDValue SavedDenormMode;
13575
13576 if (!PreservesDenormals) {
13577 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
13578 // lowering. The chain dependence is insufficient, and we need glue. We do
13579 // not need the glue variants in a strictfp function.
13580
13581 SDVTList BindParamVTs = DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue);
13582
13583 SDValue Glue = DAG.getEntryNode();
13584 if (HasDynamicDenormals) {
13585 SDNode *GetReg = DAG.getMachineNode(Opcode: AMDGPU::S_GETREG_B32, dl: SL,
13586 VTs: DAG.getVTList(VT1: MVT::i32, VT2: MVT::Glue),
13587 Ops: {BitField, Glue});
13588 SavedDenormMode = SDValue(GetReg, 0);
13589
13590 Glue = DAG.getMergeValues(
13591 Ops: {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, dl: SL);
13592 }
13593
13594 SDNode *EnableDenorm;
13595 if (Subtarget->hasDenormModeInst()) {
13596 const SDValue EnableDenormValue =
13597 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, ST: Subtarget);
13598
13599 EnableDenorm = DAG.getNode(Opcode: AMDGPUISD::DENORM_MODE, DL: SL, VTList: BindParamVTs, N1: Glue,
13600 N2: EnableDenormValue)
13601 .getNode();
13602 } else {
13603 const SDValue EnableDenormValue =
13604 DAG.getConstant(FP_DENORM_FLUSH_NONE, DL: SL, VT: MVT::i32);
13605 EnableDenorm = DAG.getMachineNode(Opcode: AMDGPU::S_SETREG_B32, dl: SL, VTs: BindParamVTs,
13606 Ops: {EnableDenormValue, BitField, Glue});
13607 }
13608
13609 SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
13610 SDValue(EnableDenorm, 1)};
13611
13612 NegDivScale0 = DAG.getMergeValues(Ops, dl: SL);
13613 }
13614
13615 SDValue Fma0 = getFPTernOp(DAG, Opcode: ISD::FMA, SL, VT: MVT::f32, A: NegDivScale0,
13616 B: ApproxRcp, C: One, GlueChain: NegDivScale0, Flags);
13617
13618 SDValue Fma1 = getFPTernOp(DAG, Opcode: ISD::FMA, SL, VT: MVT::f32, A: Fma0, B: ApproxRcp,
13619 C: ApproxRcp, GlueChain: Fma0, Flags);
13620
13621 SDValue Mul = getFPBinOp(DAG, Opcode: ISD::FMUL, SL, VT: MVT::f32, A: NumeratorScaled, B: Fma1,
13622 GlueChain: Fma1, Flags);
13623
13624 SDValue Fma2 = getFPTernOp(DAG, Opcode: ISD::FMA, SL, VT: MVT::f32, A: NegDivScale0, B: Mul,
13625 C: NumeratorScaled, GlueChain: Mul, Flags);
13626
13627 SDValue Fma3 =
13628 getFPTernOp(DAG, Opcode: ISD::FMA, SL, VT: MVT::f32, A: Fma2, B: Fma1, C: Mul, GlueChain: Fma2, Flags);
13629
13630 SDValue Fma4 = getFPTernOp(DAG, Opcode: ISD::FMA, SL, VT: MVT::f32, A: NegDivScale0, B: Fma3,
13631 C: NumeratorScaled, GlueChain: Fma3, Flags);
13632
13633 if (!PreservesDenormals) {
13634 SDNode *DisableDenorm;
13635 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
13636 const SDValue DisableDenormValue = getSPDenormModeValue(
13637 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, ST: Subtarget);
13638
13639 SDVTList BindParamVTs = DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue);
13640 DisableDenorm =
13641 DAG.getNode(Opcode: AMDGPUISD::DENORM_MODE, DL: SL, VTList: BindParamVTs,
13642 N1: Fma4.getValue(R: 1), N2: DisableDenormValue, N3: Fma4.getValue(R: 2))
13643 .getNode();
13644 } else {
13645 assert(HasDynamicDenormals == (bool)SavedDenormMode);
13646 const SDValue DisableDenormValue =
13647 HasDynamicDenormals
13648 ? SavedDenormMode
13649 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, DL: SL, VT: MVT::i32);
13650
13651 DisableDenorm = DAG.getMachineNode(
13652 Opcode: AMDGPU::S_SETREG_B32, dl: SL, VT: MVT::Other,
13653 Ops: {DisableDenormValue, BitField, Fma4.getValue(R: 1), Fma4.getValue(R: 2)});
13654 }
13655
13656 SDValue OutputChain = DAG.getNode(Opcode: ISD::TokenFactor, DL: SL, VT: MVT::Other,
13657 N1: SDValue(DisableDenorm, 0), N2: DAG.getRoot());
13658 DAG.setRoot(OutputChain);
13659 }
13660
13661 SDValue Scale = NumeratorScaled.getValue(R: 1);
13662 SDValue Fmas = DAG.getNode(Opcode: AMDGPUISD::DIV_FMAS, DL: SL, VT: MVT::f32,
13663 Ops: {Fma4, Fma1, Fma3, Scale}, Flags);
13664
13665 return DAG.getNode(Opcode: AMDGPUISD::DIV_FIXUP, DL: SL, VT: MVT::f32, N1: Fmas, N2: RHS, N3: LHS, Flags);
13666}
13667
13668SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
13669 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
13670 return FastLowered;
13671
13672 SDLoc SL(Op);
13673 SDValue X = Op.getOperand(i: 0);
13674 SDValue Y = Op.getOperand(i: 1);
13675
13676 const SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT: MVT::f64);
13677
13678 SDVTList ScaleVT = DAG.getVTList(VT1: MVT::f64, VT2: MVT::i1);
13679
13680 SDValue DivScale0 = DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL: SL, VTList: ScaleVT, N1: Y, N2: Y, N3: X);
13681
13682 SDValue NegDivScale0 = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f64, Operand: DivScale0);
13683
13684 SDValue Rcp = DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT: MVT::f64, Operand: DivScale0);
13685
13686 SDValue Fma0 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: MVT::f64, N1: NegDivScale0, N2: Rcp, N3: One);
13687
13688 SDValue Fma1 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: MVT::f64, N1: Rcp, N2: Fma0, N3: Rcp);
13689
13690 SDValue Fma2 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: MVT::f64, N1: NegDivScale0, N2: Fma1, N3: One);
13691
13692 SDValue DivScale1 = DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL: SL, VTList: ScaleVT, N1: X, N2: Y, N3: X);
13693
13694 SDValue Fma3 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: MVT::f64, N1: Fma1, N2: Fma2, N3: Fma1);
13695 SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f64, N1: DivScale1, N2: Fma3);
13696
13697 SDValue Fma4 =
13698 DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: MVT::f64, N1: NegDivScale0, N2: Mul, N3: DivScale1);
13699
13700 SDValue Scale;
13701
13702 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
13703 // Workaround a hardware bug on SI where the condition output from div_scale
13704 // is not usable.
13705
13706 const SDValue Hi = DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32);
13707
13708 // Figure out if the scale to use for div_fmas.
13709 SDValue NumBC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: X);
13710 SDValue DenBC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Y);
13711 SDValue Scale0BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: DivScale0);
13712 SDValue Scale1BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: DivScale1);
13713
13714 SDValue NumHi =
13715 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: NumBC, N2: Hi);
13716 SDValue DenHi =
13717 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: DenBC, N2: Hi);
13718
13719 SDValue Scale0Hi =
13720 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Scale0BC, N2: Hi);
13721 SDValue Scale1Hi =
13722 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Scale1BC, N2: Hi);
13723
13724 SDValue CmpDen = DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: DenHi, RHS: Scale0Hi, Cond: ISD::SETEQ);
13725 SDValue CmpNum = DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: NumHi, RHS: Scale1Hi, Cond: ISD::SETEQ);
13726 Scale = DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i1, N1: CmpNum, N2: CmpDen);
13727 } else {
13728 Scale = DivScale1.getValue(R: 1);
13729 }
13730
13731 SDValue Fmas =
13732 DAG.getNode(Opcode: AMDGPUISD::DIV_FMAS, DL: SL, VT: MVT::f64, N1: Fma4, N2: Fma3, N3: Mul, N4: Scale);
13733
13734 return DAG.getNode(Opcode: AMDGPUISD::DIV_FIXUP, DL: SL, VT: MVT::f64, N1: Fmas, N2: Y, N3: X);
13735}
13736
13737SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
13738 EVT VT = Op.getValueType();
13739
13740 if (VT == MVT::f32)
13741 return LowerFDIV32(Op, DAG);
13742
13743 if (VT == MVT::f64)
13744 return LowerFDIV64(Op, DAG);
13745
13746 if (VT == MVT::f16 || VT == MVT::bf16)
13747 return LowerFDIV16(Op, DAG);
13748
13749 llvm_unreachable("Unexpected type for fdiv");
13750}
13751
13752SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
13753 SDLoc dl(Op);
13754 SDValue Val = Op.getOperand(i: 0);
13755 EVT VT = Val.getValueType();
13756 EVT ResultExpVT = Op->getValueType(ResNo: 1);
13757 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
13758
13759 SDValue Mant = DAG.getNode(
13760 Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT,
13761 N1: DAG.getTargetConstant(Val: Intrinsic::amdgcn_frexp_mant, DL: dl, VT: MVT::i32), N2: Val);
13762
13763 SDValue Exp = DAG.getNode(
13764 Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT: InstrExpVT,
13765 N1: DAG.getTargetConstant(Val: Intrinsic::amdgcn_frexp_exp, DL: dl, VT: MVT::i32), N2: Val);
13766
13767 if (Subtarget->hasFractBug()) {
13768 SDValue Fabs = DAG.getNode(Opcode: ISD::FABS, DL: dl, VT, Operand: Val);
13769 SDValue Inf =
13770 DAG.getConstantFP(Val: APFloat::getInf(Sem: VT.getFltSemantics()), DL: dl, VT);
13771
13772 SDValue IsFinite = DAG.getSetCC(DL: dl, VT: MVT::i1, LHS: Fabs, RHS: Inf, Cond: ISD::SETOLT);
13773 SDValue Zero = DAG.getConstant(Val: 0, DL: dl, VT: InstrExpVT);
13774 Exp = DAG.getNode(Opcode: ISD::SELECT, DL: dl, VT: InstrExpVT, N1: IsFinite, N2: Exp, N3: Zero);
13775 Mant = DAG.getNode(Opcode: ISD::SELECT, DL: dl, VT, N1: IsFinite, N2: Mant, N3: Val);
13776 }
13777
13778 SDValue CastExp = DAG.getSExtOrTrunc(Op: Exp, DL: dl, VT: ResultExpVT);
13779 return DAG.getMergeValues(Ops: {Mant, CastExp}, dl);
13780}
13781
13782SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
13783 SDLoc DL(Op);
13784 StoreSDNode *Store = cast<StoreSDNode>(Val&: Op);
13785 EVT VT = Store->getMemoryVT();
13786
13787 if (VT == MVT::i1) {
13788 return DAG.getTruncStore(
13789 Chain: Store->getChain(), dl: DL,
13790 Val: DAG.getSExtOrTrunc(Op: Store->getValue(), DL, VT: MVT::i32),
13791 Ptr: Store->getBasePtr(), SVT: MVT::i1, MMO: Store->getMemOperand());
13792 }
13793
13794 assert(VT.isVector() &&
13795 Store->getValue().getValueType().getScalarType() == MVT::i32);
13796
13797 unsigned AS = Store->getAddressSpace();
13798 if (Subtarget->hasLDSMisalignedBugInWGPMode() &&
13799 AS == AMDGPUAS::FLAT_ADDRESS &&
13800 Store->getAlign().value() < VT.getStoreSize() &&
13801 VT.getSizeInBits() > 32) {
13802 return SplitVectorStore(Op, DAG);
13803 }
13804
13805 MachineFunction &MF = DAG.getMachineFunction();
13806 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
13807 // If there is a possibility that flat instruction access scratch memory
13808 // then we need to use the same legalization rules we use for private.
13809 if (AS == AMDGPUAS::FLAT_ADDRESS &&
13810 !Subtarget->hasMultiDwordFlatScratchAddressing())
13811 AS = addressMayBeAccessedAsPrivate(MMO: Store->getMemOperand(), Info: *MFI)
13812 ? AMDGPUAS::PRIVATE_ADDRESS
13813 : AMDGPUAS::GLOBAL_ADDRESS;
13814
13815 unsigned NumElements = VT.getVectorNumElements();
13816 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
13817 if (NumElements > 4)
13818 return SplitVectorStore(Op, DAG);
13819 // v3 stores not supported on SI.
13820 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
13821 return SplitVectorStore(Op, DAG);
13822
13823 if (!allowsMemoryAccessForAlignment(Context&: *DAG.getContext(), DL: DAG.getDataLayout(),
13824 VT, MMO: *Store->getMemOperand()))
13825 return expandUnalignedStore(ST: Store, DAG);
13826
13827 return SDValue();
13828 }
13829 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
13830 switch (Subtarget->getMaxPrivateElementSize()) {
13831 case 4:
13832 return scalarizeVectorStore(ST: Store, DAG);
13833 case 8:
13834 if (NumElements > 2)
13835 return SplitVectorStore(Op, DAG);
13836 return SDValue();
13837 case 16:
13838 if (NumElements > 4 ||
13839 (NumElements == 3 && !Subtarget->hasFlatScratchEnabled()))
13840 return SplitVectorStore(Op, DAG);
13841 return SDValue();
13842 default:
13843 llvm_unreachable("unsupported private_element_size");
13844 }
13845 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
13846 unsigned Fast = 0;
13847 auto Flags = Store->getMemOperand()->getFlags();
13848 if (allowsMisalignedMemoryAccessesImpl(Size: VT.getSizeInBits(), AddrSpace: AS,
13849 Alignment: Store->getAlign(), Flags, IsFast: &Fast) &&
13850 Fast > 1)
13851 return SDValue();
13852
13853 if (VT.isVector())
13854 return SplitVectorStore(Op, DAG);
13855
13856 return expandUnalignedStore(ST: Store, DAG);
13857 }
13858
13859 // Probably an invalid store. If so we'll end up emitting a selection error.
13860 return SDValue();
13861}
13862
13863// Avoid the full correct expansion for f32 sqrt when promoting from f16.
13864SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
13865 SDLoc SL(Op);
13866 assert(!Subtarget->has16BitInsts());
13867 SDNodeFlags Flags = Op->getFlags();
13868 SDValue Ext =
13869 DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Op.getOperand(i: 0), Flags);
13870
13871 SDValue SqrtID = DAG.getTargetConstant(Val: Intrinsic::amdgcn_sqrt, DL: SL, VT: MVT::i32);
13872 SDValue Sqrt =
13873 DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::f32, N1: SqrtID, N2: Ext, Flags);
13874
13875 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT: MVT::f16, N1: Sqrt,
13876 N2: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32), Flags);
13877}
13878
13879SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
13880 SDLoc DL(Op);
13881 SDNodeFlags Flags = Op->getFlags();
13882 MVT VT = Op.getValueType().getSimpleVT();
13883 const SDValue X = Op.getOperand(i: 0);
13884
13885 if (allowApproxFunc(DAG, Flags)) {
13886 // Instruction is 1ulp but ignores denormals.
13887 return DAG.getNode(
13888 Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT,
13889 N1: DAG.getTargetConstant(Val: Intrinsic::amdgcn_sqrt, DL, VT: MVT::i32), N2: X, Flags);
13890 }
13891
13892 SDValue ScaleThreshold = DAG.getConstantFP(Val: 0x1.0p-96f, DL, VT);
13893 SDValue NeedScale = DAG.getSetCC(DL, VT: MVT::i1, LHS: X, RHS: ScaleThreshold, Cond: ISD::SETOLT);
13894
13895 SDValue ScaleUpFactor = DAG.getConstantFP(Val: 0x1.0p+32f, DL, VT);
13896
13897 SDValue ScaledX = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: X, N2: ScaleUpFactor, Flags);
13898
13899 SDValue SqrtX =
13900 DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: NeedScale, N2: ScaledX, N3: X, Flags);
13901
13902 SDValue SqrtS;
13903 if (needsDenormHandlingF32(DAG, Src: X, Flags)) {
13904 SDValue SqrtID =
13905 DAG.getTargetConstant(Val: Intrinsic::amdgcn_sqrt, DL, VT: MVT::i32);
13906 SqrtS = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT, N1: SqrtID, N2: SqrtX, Flags);
13907
13908 SDValue SqrtSAsInt = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i32, Operand: SqrtS);
13909 SDValue SqrtSNextDownInt =
13910 DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, N1: SqrtSAsInt,
13911 N2: DAG.getAllOnesConstant(DL, VT: MVT::i32));
13912 SDValue SqrtSNextDown = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: SqrtSNextDownInt);
13913
13914 SDValue NegSqrtSNextDown =
13915 DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: SqrtSNextDown, Flags);
13916
13917 SDValue SqrtVP =
13918 DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: NegSqrtSNextDown, N2: SqrtS, N3: SqrtX, Flags);
13919
13920 SDValue SqrtSNextUpInt = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, N1: SqrtSAsInt,
13921 N2: DAG.getConstant(Val: 1, DL, VT: MVT::i32));
13922 SDValue SqrtSNextUp = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: SqrtSNextUpInt);
13923
13924 SDValue NegSqrtSNextUp = DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: SqrtSNextUp, Flags);
13925 SDValue SqrtVS =
13926 DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: NegSqrtSNextUp, N2: SqrtS, N3: SqrtX, Flags);
13927
13928 SDValue Zero = DAG.getConstantFP(Val: 0.0f, DL, VT);
13929 SDValue SqrtVPLE0 = DAG.getSetCC(DL, VT: MVT::i1, LHS: SqrtVP, RHS: Zero, Cond: ISD::SETOLE);
13930
13931 SqrtS = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: SqrtVPLE0, N2: SqrtSNextDown, N3: SqrtS,
13932 Flags);
13933
13934 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, VT: MVT::i1, LHS: SqrtVS, RHS: Zero, Cond: ISD::SETOGT);
13935 SqrtS = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: SqrtVPVSGT0, N2: SqrtSNextUp, N3: SqrtS,
13936 Flags);
13937 } else {
13938 SDValue SqrtR = DAG.getNode(Opcode: AMDGPUISD::RSQ, DL, VT, Operand: SqrtX, Flags);
13939
13940 SqrtS = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: SqrtX, N2: SqrtR, Flags);
13941
13942 SDValue Half = DAG.getConstantFP(Val: 0.5f, DL, VT);
13943 SDValue SqrtH = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: SqrtR, N2: Half, Flags);
13944 SDValue NegSqrtH = DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: SqrtH, Flags);
13945
13946 SDValue SqrtE = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: NegSqrtH, N2: SqrtS, N3: Half, Flags);
13947 SqrtH = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: SqrtH, N2: SqrtE, N3: SqrtH, Flags);
13948 SqrtS = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: SqrtS, N2: SqrtE, N3: SqrtS, Flags);
13949
13950 SDValue NegSqrtS = DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: SqrtS, Flags);
13951 SDValue SqrtD =
13952 DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: NegSqrtS, N2: SqrtS, N3: SqrtX, Flags);
13953 SqrtS = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: SqrtD, N2: SqrtH, N3: SqrtS, Flags);
13954 }
13955
13956 SDValue ScaleDownFactor = DAG.getConstantFP(Val: 0x1.0p-16f, DL, VT);
13957
13958 SDValue ScaledDown =
13959 DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: SqrtS, N2: ScaleDownFactor, Flags);
13960
13961 SqrtS = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: NeedScale, N2: ScaledDown, N3: SqrtS, Flags);
13962 SDValue IsZeroOrInf =
13963 DAG.getNode(Opcode: ISD::IS_FPCLASS, DL, VT: MVT::i1, N1: SqrtX,
13964 N2: DAG.getTargetConstant(Val: fcZero | fcPosInf, DL, VT: MVT::i32));
13965
13966 return DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: IsZeroOrInf, N2: SqrtX, N3: SqrtS, Flags);
13967}
13968
13969SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
13970 // For double type, the SQRT and RSQ instructions don't have required
13971 // precision, we apply Goldschmidt's algorithm to improve the result:
13972 //
13973 // y0 = rsq(x)
13974 // g0 = x * y0
13975 // h0 = 0.5 * y0
13976 //
13977 // r0 = 0.5 - h0 * g0
13978 // g1 = g0 * r0 + g0
13979 // h1 = h0 * r0 + h0
13980 //
13981 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
13982 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
13983 // h2 = h1 * r1 + h1
13984 //
13985 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
13986 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
13987 //
13988 // sqrt(x) = g3
13989
13990 SDNodeFlags Flags = Op->getFlags();
13991
13992 SDLoc DL(Op);
13993
13994 SDValue X = Op.getOperand(i: 0);
13995 SDValue ZeroInt = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
13996
13997 SDValue SqrtX = X;
13998 SDValue Scaling;
13999 if (!Flags.hasApproximateFuncs()) {
14000 SDValue ScaleConstant = DAG.getConstantFP(Val: 0x1.0p-767, DL, VT: MVT::f64);
14001 Scaling = DAG.getSetCC(DL, VT: MVT::i1, LHS: X, RHS: ScaleConstant, Cond: ISD::SETOLT);
14002
14003 // Scale up input if it is too small.
14004 SDValue ScaleUpFactor = DAG.getConstant(Val: 256, DL, VT: MVT::i32);
14005 SDValue ScaleUp =
14006 DAG.getNode(Opcode: ISD::SELECT, DL, VT: MVT::i32, N1: Scaling, N2: ScaleUpFactor, N3: ZeroInt);
14007 SqrtX = DAG.getNode(Opcode: ISD::FLDEXP, DL, VT: MVT::f64, N1: X, N2: ScaleUp, Flags);
14008 }
14009
14010 SDValue SqrtY = DAG.getNode(Opcode: AMDGPUISD::RSQ, DL, VT: MVT::f64, Operand: SqrtX);
14011
14012 SDValue SqrtS0 = DAG.getNode(Opcode: ISD::FMUL, DL, VT: MVT::f64, N1: SqrtX, N2: SqrtY);
14013
14014 SDValue Half = DAG.getConstantFP(Val: 0.5, DL, VT: MVT::f64);
14015 SDValue SqrtH0 = DAG.getNode(Opcode: ISD::FMUL, DL, VT: MVT::f64, N1: SqrtY, N2: Half);
14016
14017 SDValue NegSqrtH0 = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f64, Operand: SqrtH0);
14018 SDValue SqrtR0 = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: NegSqrtH0, N2: SqrtS0, N3: Half);
14019
14020 SDValue SqrtH1 = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: SqrtH0, N2: SqrtR0, N3: SqrtH0);
14021
14022 SDValue SqrtS1 = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: SqrtS0, N2: SqrtR0, N3: SqrtS0);
14023
14024 SDValue NegSqrtS1 = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f64, Operand: SqrtS1);
14025 SDValue SqrtD0 =
14026 DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: NegSqrtS1, N2: SqrtS1, N3: SqrtX);
14027
14028 SDValue SqrtS2 = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: SqrtD0, N2: SqrtH1, N3: SqrtS1);
14029
14030 SDValue SqrtRet = SqrtS2;
14031 if (!Flags.hasApproximateFuncs()) {
14032 SDValue NegSqrtS2 = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f64, Operand: SqrtS2);
14033 SDValue SqrtD1 =
14034 DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: NegSqrtS2, N2: SqrtS2, N3: SqrtX);
14035
14036 SqrtRet = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: SqrtD1, N2: SqrtH1, N3: SqrtS2);
14037
14038 SDValue ScaleDownFactor = DAG.getSignedConstant(Val: -128, DL, VT: MVT::i32);
14039 SDValue ScaleDown = DAG.getNode(Opcode: ISD::SELECT, DL, VT: MVT::i32, N1: Scaling,
14040 N2: ScaleDownFactor, N3: ZeroInt);
14041 SqrtRet = DAG.getNode(Opcode: ISD::FLDEXP, DL, VT: MVT::f64, N1: SqrtRet, N2: ScaleDown, Flags);
14042 }
14043
14044 // TODO: Check for DAZ and expand to subnormals
14045
14046 SDValue IsZeroOrInf;
14047 if (Flags.hasNoInfs()) {
14048 SDValue Zero = DAG.getConstantFP(Val: 0.0, DL, VT: MVT::f64);
14049 IsZeroOrInf = DAG.getSetCC(DL, VT: MVT::i1, LHS: SqrtX, RHS: Zero, Cond: ISD::SETOEQ);
14050 } else {
14051 IsZeroOrInf =
14052 DAG.getNode(Opcode: ISD::IS_FPCLASS, DL, VT: MVT::i1, N1: SqrtX,
14053 N2: DAG.getTargetConstant(Val: fcZero | fcPosInf, DL, VT: MVT::i32));
14054 }
14055
14056 // If x is +INF, +0, or -0, use its original value
14057 return DAG.getNode(Opcode: ISD::SELECT, DL, VT: MVT::f64, N1: IsZeroOrInf, N2: SqrtX, N3: SqrtRet,
14058 Flags);
14059}
14060
14061SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
14062 SDLoc DL(Op);
14063 EVT VT = Op.getValueType();
14064 SDValue Arg = Op.getOperand(i: 0);
14065 SDValue TrigVal;
14066
14067 // Propagate fast-math flags so that the multiply we introduce can be folded
14068 // if Arg is already the result of a multiply by constant.
14069 auto Flags = Op->getFlags();
14070
14071 // AMDGPUISD nodes of vector type must be unrolled here since
14072 // they will not be expanded elsewhere.
14073 auto UnrollIfVec = [&DAG](SDValue V) -> SDValue {
14074 if (!V.getValueType().isVector())
14075 return V;
14076
14077 return DAG.UnrollVectorOp(N: cast<SDNode>(Val&: V));
14078 };
14079
14080 SDValue OneOver2Pi = DAG.getConstantFP(Val: 0.5 * numbers::inv_pi, DL, VT);
14081
14082 if (Subtarget->hasTrigReducedRange()) {
14083 SDValue MulVal = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Arg, N2: OneOver2Pi, Flags);
14084 TrigVal = UnrollIfVec(DAG.getNode(Opcode: AMDGPUISD::FRACT, DL, VT, Operand: MulVal, Flags));
14085 } else {
14086 TrigVal = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Arg, N2: OneOver2Pi, Flags);
14087 }
14088
14089 switch (Op.getOpcode()) {
14090 case ISD::FCOS:
14091 TrigVal = DAG.getNode(Opcode: AMDGPUISD::COS_HW, DL: SDLoc(Op), VT, Operand: TrigVal, Flags);
14092 break;
14093 case ISD::FSIN:
14094 TrigVal = DAG.getNode(Opcode: AMDGPUISD::SIN_HW, DL: SDLoc(Op), VT, Operand: TrigVal, Flags);
14095 break;
14096 default:
14097 llvm_unreachable("Wrong trig opcode");
14098 }
14099
14100 return UnrollIfVec(TrigVal);
14101}
14102
14103SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
14104 SelectionDAG &DAG) const {
14105 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Val&: Op);
14106 assert(AtomicNode->isCompareAndSwap());
14107 unsigned AS = AtomicNode->getAddressSpace();
14108
14109 // No custom lowering required for local address space
14110 if (!AMDGPU::isFlatGlobalAddrSpace(AS))
14111 return Op;
14112
14113 // Non-local address space requires custom lowering for atomic compare
14114 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
14115 SDLoc DL(Op);
14116 SDValue ChainIn = Op.getOperand(i: 0);
14117 SDValue Addr = Op.getOperand(i: 1);
14118 SDValue Old = Op.getOperand(i: 2);
14119 SDValue New = Op.getOperand(i: 3);
14120 EVT VT = Op.getValueType();
14121 MVT SimpleVT = VT.getSimpleVT();
14122 MVT VecType = MVT::getVectorVT(VT: SimpleVT, NumElements: 2);
14123
14124 SDValue NewOld = DAG.getBuildVector(VT: VecType, DL, Ops: {New, Old});
14125 SDValue Ops[] = {ChainIn, Addr, NewOld};
14126
14127 return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::ATOMIC_CMP_SWAP, dl: DL,
14128 VTList: Op->getVTList(), Ops, MemVT: VT,
14129 MMO: AtomicNode->getMemOperand());
14130}
14131
14132//===----------------------------------------------------------------------===//
14133// Custom DAG optimizations
14134//===----------------------------------------------------------------------===//
14135
14136SDValue
14137SITargetLowering::performUCharToFloatCombine(SDNode *N,
14138 DAGCombinerInfo &DCI) const {
14139 EVT VT = N->getValueType(ResNo: 0);
14140 EVT ScalarVT = VT.getScalarType();
14141 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
14142 return SDValue();
14143
14144 SelectionDAG &DAG = DCI.DAG;
14145 SDLoc DL(N);
14146
14147 SDValue Src = N->getOperand(Num: 0);
14148 EVT SrcVT = Src.getValueType();
14149
14150 // TODO: We could try to match extracting the higher bytes, which would be
14151 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
14152 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
14153 // about in practice.
14154 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
14155 if (DAG.MaskedValueIsZero(Op: Src, Mask: APInt::getHighBitsSet(numBits: 32, hiBitsSet: 24))) {
14156 SDValue Cvt = DAG.getNode(Opcode: AMDGPUISD::CVT_F32_UBYTE0, DL, VT: MVT::f32, Operand: Src);
14157 DCI.AddToWorklist(N: Cvt.getNode());
14158
14159 // For the f16 case, fold to a cast to f32 and then cast back to f16.
14160 if (ScalarVT != MVT::f32) {
14161 Cvt = DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT, N1: Cvt,
14162 N2: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32));
14163 }
14164 return Cvt;
14165 }
14166 }
14167
14168 return SDValue();
14169}
14170
14171SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
14172 DAGCombinerInfo &DCI) const {
14173 SDValue MagnitudeOp = N->getOperand(Num: 0);
14174 SDValue SignOp = N->getOperand(Num: 1);
14175
14176 // The generic combine for fcopysign + fp cast is too conservative with
14177 // vectors, and also gets confused by the splitting we will perform here, so
14178 // peek through FP casts.
14179 if (SignOp.getOpcode() == ISD::FP_EXTEND ||
14180 SignOp.getOpcode() == ISD::FP_ROUND)
14181 SignOp = SignOp.getOperand(i: 0);
14182
14183 SelectionDAG &DAG = DCI.DAG;
14184 SDLoc DL(N);
14185 EVT SignVT = SignOp.getValueType();
14186
14187 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
14188 // lower half with a copy.
14189 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
14190 EVT MagVT = MagnitudeOp.getValueType();
14191
14192 unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
14193
14194 if (MagVT.getScalarType() == MVT::f64) {
14195 EVT F32VT = MagVT.isVector()
14196 ? EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::f32, NumElements: 2 * NumElts)
14197 : MVT::v2f32;
14198
14199 SDValue MagAsVector = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: F32VT, Operand: MagnitudeOp);
14200
14201 SmallVector<SDValue, 8> NewElts;
14202 for (unsigned I = 0; I != NumElts; ++I) {
14203 SDValue MagLo =
14204 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::f32, N1: MagAsVector,
14205 N2: DAG.getConstant(Val: 2 * I, DL, VT: MVT::i32));
14206 SDValue MagHi =
14207 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::f32, N1: MagAsVector,
14208 N2: DAG.getConstant(Val: 2 * I + 1, DL, VT: MVT::i32));
14209
14210 SDValue SignOpElt =
14211 MagVT.isVector()
14212 ? DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: SignVT.getScalarType(),
14213 N1: SignOp, N2: DAG.getConstant(Val: I, DL, VT: MVT::i32))
14214 : SignOp;
14215
14216 SDValue HiOp =
14217 DAG.getNode(Opcode: ISD::FCOPYSIGN, DL, VT: MVT::f32, N1: MagHi, N2: SignOpElt);
14218
14219 SDValue Vector =
14220 DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL, VT: MVT::v2f32, N1: MagLo, N2: HiOp);
14221
14222 SDValue NewElt = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f64, Operand: Vector);
14223 NewElts.push_back(Elt: NewElt);
14224 }
14225
14226 if (NewElts.size() == 1)
14227 return NewElts[0];
14228
14229 return DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL, VT: MagVT, Ops: NewElts);
14230 }
14231
14232 if (SignVT.getScalarType() != MVT::f64)
14233 return SDValue();
14234
14235 // Reduce width of sign operand, we only need the highest bit.
14236 //
14237 // fcopysign f64:x, f64:y ->
14238 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
14239 // TODO: In some cases it might make sense to go all the way to f16.
14240
14241 EVT F32VT = MagVT.isVector()
14242 ? EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::f32, NumElements: 2 * NumElts)
14243 : MVT::v2f32;
14244
14245 SDValue SignAsVector = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: F32VT, Operand: SignOp);
14246
14247 SmallVector<SDValue, 8> F32Signs;
14248 for (unsigned I = 0; I != NumElts; ++I) {
14249 // Take sign from odd elements of cast vector
14250 SDValue SignAsF32 =
14251 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::f32, N1: SignAsVector,
14252 N2: DAG.getConstant(Val: 2 * I + 1, DL, VT: MVT::i32));
14253 F32Signs.push_back(Elt: SignAsF32);
14254 }
14255
14256 SDValue NewSign =
14257 NumElts == 1
14258 ? F32Signs.back()
14259 : DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL,
14260 VT: EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::f32, NumElements: NumElts),
14261 Ops: F32Signs);
14262
14263 return DAG.getNode(Opcode: ISD::FCOPYSIGN, DL, VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 0),
14264 N2: NewSign);
14265}
14266
14267// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
14268// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
14269// bits
14270
14271// This is a variant of
14272// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
14273//
14274// The normal DAG combiner will do this, but only if the add has one use since
14275// that would increase the number of instructions.
14276//
14277// This prevents us from seeing a constant offset that can be folded into a
14278// memory instruction's addressing mode. If we know the resulting add offset of
14279// a pointer can be folded into an addressing offset, we can replace the pointer
14280// operand with the add of new constant offset. This eliminates one of the uses,
14281// and may allow the remaining use to also be simplified.
14282//
14283SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
14284 EVT MemVT,
14285 DAGCombinerInfo &DCI) const {
14286 SDValue N0 = N->getOperand(Num: 0);
14287 SDValue N1 = N->getOperand(Num: 1);
14288
14289 // We only do this to handle cases where it's profitable when there are
14290 // multiple uses of the add, so defer to the standard combine.
14291 if ((!N0->isAnyAdd() && N0.getOpcode() != ISD::OR) || N0->hasOneUse())
14292 return SDValue();
14293
14294 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(Val&: N1);
14295 if (!CN1)
14296 return SDValue();
14297
14298 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(Val: N0.getOperand(i: 1));
14299 if (!CAdd)
14300 return SDValue();
14301
14302 SelectionDAG &DAG = DCI.DAG;
14303
14304 if (N0->getOpcode() == ISD::OR &&
14305 !DAG.haveNoCommonBitsSet(A: N0.getOperand(i: 0), B: N0.getOperand(i: 1)))
14306 return SDValue();
14307
14308 // If the resulting offset is too large, we can't fold it into the
14309 // addressing mode offset.
14310 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
14311 Type *Ty = MemVT.getTypeForEVT(Context&: *DCI.DAG.getContext());
14312
14313 AddrMode AM;
14314 AM.HasBaseReg = true;
14315 AM.BaseOffs = Offset.getSExtValue();
14316 if (!isLegalAddressingMode(DL: DCI.DAG.getDataLayout(), AM, Ty, AS: AddrSpace))
14317 return SDValue();
14318
14319 SDLoc SL(N);
14320 EVT VT = N->getValueType(ResNo: 0);
14321
14322 SDValue ShlX = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT, N1: N0.getOperand(i: 0), N2: N1);
14323 SDValue COffset = DAG.getConstant(Val: Offset, DL: SL, VT);
14324
14325 SDNodeFlags Flags;
14326 Flags.setNoUnsignedWrap(
14327 N->getFlags().hasNoUnsignedWrap() &&
14328 (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));
14329
14330 // Use ISD::ADD even if the original operation was ISD::PTRADD, since we can't
14331 // be sure that the new left operand is a proper base pointer.
14332 return DAG.getNode(Opcode: ISD::ADD, DL: SL, VT, N1: ShlX, N2: COffset, Flags);
14333}
14334
14335/// MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset
14336/// by the chain and intrinsic ID. Theoretically we would also need to check the
14337/// specific intrinsic, but they all place the pointer operand first.
14338static unsigned getBasePtrIndex(const MemSDNode *N) {
14339 switch (N->getOpcode()) {
14340 case ISD::STORE:
14341 case ISD::INTRINSIC_W_CHAIN:
14342 case ISD::INTRINSIC_VOID:
14343 return 2;
14344 default:
14345 return 1;
14346 }
14347}
14348
14349SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
14350 DAGCombinerInfo &DCI) const {
14351 SelectionDAG &DAG = DCI.DAG;
14352
14353 unsigned PtrIdx = getBasePtrIndex(N);
14354 SDValue Ptr = N->getOperand(Num: PtrIdx);
14355
14356 // TODO: We could also do this for multiplies.
14357 if (Ptr.getOpcode() == ISD::SHL) {
14358 SDValue NewPtr = performSHLPtrCombine(N: Ptr.getNode(), AddrSpace: N->getAddressSpace(),
14359 MemVT: N->getMemoryVT(), DCI);
14360 if (NewPtr) {
14361 SmallVector<SDValue, 8> NewOps(N->ops());
14362
14363 NewOps[PtrIdx] = NewPtr;
14364 return SDValue(DAG.UpdateNodeOperands(N, Ops: NewOps), 0);
14365 }
14366 }
14367
14368 return SDValue();
14369}
14370
14371static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
14372 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
14373 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
14374 (Opc == ISD::XOR && Val == 0);
14375}
14376
14377// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
14378// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
14379// integer combine opportunities since most 64-bit operations are decomposed
14380// this way. TODO: We won't want this for SALU especially if it is an inline
14381// immediate.
14382SDValue SITargetLowering::splitBinaryBitConstantOp(
14383 DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
14384 const ConstantSDNode *CRHS) const {
14385 uint64_t Val = CRHS->getZExtValue();
14386 uint32_t ValLo = Lo_32(Value: Val);
14387 uint32_t ValHi = Hi_32(Value: Val);
14388 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14389
14390 if ((bitOpWithConstantIsReducible(Opc, Val: ValLo) ||
14391 bitOpWithConstantIsReducible(Opc, Val: ValHi)) ||
14392 (CRHS->hasOneUse() && !TII->isInlineConstant(Imm: CRHS->getAPIntValue()))) {
14393 // We have 64-bit scalar and/or/xor, but do not have vector forms.
14394 if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
14395 !CRHS->user_begin()->isDivergent())
14396 return SDValue();
14397
14398 // If we need to materialize a 64-bit immediate, it will be split up later
14399 // anyway. Avoid creating the harder to understand 64-bit immediate
14400 // materialization.
14401 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
14402 }
14403
14404 return SDValue();
14405}
14406
14407bool llvm::isBoolSGPR(SDValue V) {
14408 if (V.getValueType() != MVT::i1)
14409 return false;
14410 switch (V.getOpcode()) {
14411 default:
14412 break;
14413 case ISD::SETCC:
14414 case ISD::IS_FPCLASS:
14415 case AMDGPUISD::FP_CLASS:
14416 return true;
14417 case ISD::AND:
14418 case ISD::OR:
14419 case ISD::XOR:
14420 return isBoolSGPR(V: V.getOperand(i: 0)) && isBoolSGPR(V: V.getOperand(i: 1));
14421 case ISD::SADDO:
14422 case ISD::UADDO:
14423 case ISD::SSUBO:
14424 case ISD::USUBO:
14425 case ISD::SMULO:
14426 case ISD::UMULO:
14427 return V.getResNo() == 1;
14428 case ISD::INTRINSIC_WO_CHAIN: {
14429 unsigned IntrinsicID = V.getConstantOperandVal(i: 0);
14430 switch (IntrinsicID) {
14431 case Intrinsic::amdgcn_is_shared:
14432 case Intrinsic::amdgcn_is_private:
14433 return true;
14434 default:
14435 return false;
14436 }
14437
14438 return false;
14439 }
14440 }
14441 return false;
14442}
14443
// If every byte of \p C is either 0x00 or 0xff, return \p C unchanged.
// Otherwise (some byte is only partially set) return 0.
static uint32_t getConstantPermuteMask(uint32_t C) {
  for (unsigned Shift = 0; Shift != 32; Shift += 8) {
    const uint32_t Byte = (C >> Shift) & 0xff;
    if (Byte != 0x00 && Byte != 0xff)
      return 0; // Partial bytes selected.
  }
  return C;
}
14462
14463// Check if a node selects whole bytes from its operand 0 starting at a byte
14464// boundary while masking the rest. Returns select mask as in the v_perm_b32
14465// or -1 if not succeeded.
14466// Note byte select encoding:
14467// value 0-3 selects corresponding source byte;
14468// value 0xc selects zero;
14469// value 0xff selects 0xff.
14470static uint32_t getPermuteMask(SDValue V) {
14471 assert(V.getValueSizeInBits() == 32);
14472
14473 if (V.getNumOperands() != 2)
14474 return ~0;
14475
14476 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(Val: V.getOperand(i: 1));
14477 if (!N1)
14478 return ~0;
14479
14480 uint32_t C = N1->getZExtValue();
14481
14482 switch (V.getOpcode()) {
14483 default:
14484 break;
14485 case ISD::AND:
14486 if (uint32_t ConstMask = getConstantPermuteMask(C))
14487 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
14488 break;
14489
14490 case ISD::OR:
14491 if (uint32_t ConstMask = getConstantPermuteMask(C))
14492 return (0x03020100 & ~ConstMask) | ConstMask;
14493 break;
14494
14495 case ISD::SHL:
14496 if (C % 8)
14497 return ~0;
14498
14499 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
14500
14501 case ISD::SRL:
14502 if (C % 8)
14503 return ~0;
14504
14505 return uint32_t(0x0c0c0c0c03020100ull >> C);
14506 }
14507
14508 return ~0;
14509}
14510
14511SDValue SITargetLowering::performAndCombine(SDNode *N,
14512 DAGCombinerInfo &DCI) const {
14513 if (DCI.isBeforeLegalize())
14514 return SDValue();
14515
14516 SelectionDAG &DAG = DCI.DAG;
14517 EVT VT = N->getValueType(ResNo: 0);
14518 SDValue LHS = N->getOperand(Num: 0);
14519 SDValue RHS = N->getOperand(Num: 1);
14520
14521 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
14522 if (VT == MVT::i64 && CRHS) {
14523 if (SDValue Split =
14524 splitBinaryBitConstantOp(DCI, SL: SDLoc(N), Opc: ISD::AND, LHS, CRHS))
14525 return Split;
14526 }
14527
14528 if (CRHS && VT == MVT::i32) {
14529 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
14530 // nb = number of trailing zeroes in mask
14531 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
14532 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
14533 uint64_t Mask = CRHS->getZExtValue();
14534 unsigned Bits = llvm::popcount(Value: Mask);
14535 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
14536 (Bits == 8 || Bits == 16) && isShiftedMask_64(Value: Mask) && !(Mask & 1)) {
14537 if (auto *CShift = dyn_cast<ConstantSDNode>(Val: LHS->getOperand(Num: 1))) {
14538 unsigned Shift = CShift->getZExtValue();
14539 unsigned NB = CRHS->getAPIntValue().countr_zero();
14540 unsigned Offset = NB + Shift;
14541 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
14542 SDLoc SL(N);
14543 SDValue BFE =
14544 DAG.getNode(Opcode: AMDGPUISD::BFE_U32, DL: SL, VT: MVT::i32, N1: LHS->getOperand(Num: 0),
14545 N2: DAG.getConstant(Val: Offset, DL: SL, VT: MVT::i32),
14546 N3: DAG.getConstant(Val: Bits, DL: SL, VT: MVT::i32));
14547 EVT NarrowVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: Bits);
14548 SDValue Ext = DAG.getNode(Opcode: ISD::AssertZext, DL: SL, VT, N1: BFE,
14549 N2: DAG.getValueType(NarrowVT));
14550 SDValue Shl = DAG.getNode(Opcode: ISD::SHL, DL: SDLoc(LHS), VT, N1: Ext,
14551 N2: DAG.getConstant(Val: NB, DL: SDLoc(CRHS), VT: MVT::i32));
14552 return Shl;
14553 }
14554 }
14555 }
14556
14557 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
14558 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
14559 isa<ConstantSDNode>(Val: LHS.getOperand(i: 2))) {
14560 uint32_t Sel = getConstantPermuteMask(C: Mask);
14561 if (!Sel)
14562 return SDValue();
14563
14564 // Select 0xc for all zero bytes
14565 Sel = (LHS.getConstantOperandVal(i: 2) & Sel) | (~Sel & 0x0c0c0c0c);
14566 SDLoc DL(N);
14567 return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: LHS.getOperand(i: 0),
14568 N2: LHS.getOperand(i: 1), N3: DAG.getConstant(Val: Sel, DL, VT: MVT::i32));
14569 }
14570 }
14571
14572 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
14573 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
14574 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
14575 ISD::CondCode LCC = cast<CondCodeSDNode>(Val: LHS.getOperand(i: 2))->get();
14576 ISD::CondCode RCC = cast<CondCodeSDNode>(Val: RHS.getOperand(i: 2))->get();
14577
14578 SDValue X = LHS.getOperand(i: 0);
14579 SDValue Y = RHS.getOperand(i: 0);
14580 if (Y.getOpcode() != ISD::FABS || Y.getOperand(i: 0) != X ||
14581 !isTypeLegal(VT: X.getValueType()))
14582 return SDValue();
14583
14584 if (LCC == ISD::SETO) {
14585 if (X != LHS.getOperand(i: 1))
14586 return SDValue();
14587
14588 if (RCC == ISD::SETUNE) {
14589 const ConstantFPSDNode *C1 =
14590 dyn_cast<ConstantFPSDNode>(Val: RHS.getOperand(i: 1));
14591 if (!C1 || !C1->isInfinity() || C1->isNegative())
14592 return SDValue();
14593
14594 const uint32_t Mask = SIInstrFlags::N_NORMAL |
14595 SIInstrFlags::N_SUBNORMAL | SIInstrFlags::N_ZERO |
14596 SIInstrFlags::P_ZERO | SIInstrFlags::P_SUBNORMAL |
14597 SIInstrFlags::P_NORMAL;
14598
14599 static_assert(
14600 ((~(SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN |
14601 SIInstrFlags::N_INFINITY | SIInstrFlags::P_INFINITY)) &
14602 0x3ff) == Mask,
14603 "mask not equal");
14604
14605 SDLoc DL(N);
14606 return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL, VT: MVT::i1, N1: X,
14607 N2: DAG.getConstant(Val: Mask, DL, VT: MVT::i32));
14608 }
14609 }
14610 }
14611
14612 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
14613 std::swap(a&: LHS, b&: RHS);
14614
14615 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
14616 RHS.hasOneUse()) {
14617 ISD::CondCode LCC = cast<CondCodeSDNode>(Val: LHS.getOperand(i: 2))->get();
14618 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan |
14619 // n_nan) and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan
14620 // | n_nan)
14621 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Val: RHS.getOperand(i: 1));
14622 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
14623 (RHS.getOperand(i: 0) == LHS.getOperand(i: 0) &&
14624 LHS.getOperand(i: 0) == LHS.getOperand(i: 1))) {
14625 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
14626 unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
14627 : Mask->getZExtValue() & OrdMask;
14628
14629 SDLoc DL(N);
14630 return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL, VT: MVT::i1, N1: RHS.getOperand(i: 0),
14631 N2: DAG.getConstant(Val: NewMask, DL, VT: MVT::i32));
14632 }
14633 }
14634
14635 if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
14636 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
14637 // and x, (sext cc from i1) => select cc, x, 0
14638 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
14639 std::swap(a&: LHS, b&: RHS);
14640 if (isBoolSGPR(V: RHS.getOperand(i: 0)))
14641 return DAG.getSelect(DL: SDLoc(N), VT: MVT::i32, Cond: RHS.getOperand(i: 0), LHS,
14642 RHS: DAG.getConstant(Val: 0, DL: SDLoc(N), VT: MVT::i32));
14643 }
14644
14645 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
14646 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14647 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
14648 N->isDivergent() && TII->pseudoToMCOpcode(Opcode: AMDGPU::V_PERM_B32_e64) != -1) {
14649 uint32_t LHSMask = getPermuteMask(V: LHS);
14650 uint32_t RHSMask = getPermuteMask(V: RHS);
14651 if (LHSMask != ~0u && RHSMask != ~0u) {
14652 // Canonicalize the expression in an attempt to have fewer unique masks
14653 // and therefore fewer registers used to hold the masks.
14654 if (LHSMask > RHSMask) {
14655 std::swap(a&: LHSMask, b&: RHSMask);
14656 std::swap(a&: LHS, b&: RHS);
14657 }
14658
14659 // Select 0xc for each lane used from source operand. Zero has 0xc mask
14660 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
14661 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14662 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14663
14664 // Check of we need to combine values from two sources within a byte.
14665 if (!(LHSUsedLanes & RHSUsedLanes) &&
14666 // If we select high and lower word keep it for SDWA.
14667 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
14668 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
14669 // Each byte in each mask is either selector mask 0-3, or has higher
14670 // bits set in either of masks, which can be 0xff for 0xff or 0x0c for
14671 // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise
14672 // mask which is not 0xff wins. By anding both masks we have a correct
14673 // result except that 0x0c shall be corrected to give 0x0c only.
14674 uint32_t Mask = LHSMask & RHSMask;
14675 for (unsigned I = 0; I < 32; I += 8) {
14676 uint32_t ByteSel = 0xff << I;
14677 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
14678 Mask &= (0x0c << I) & 0xffffffff;
14679 }
14680
14681 // Add 4 to each active LHS lane. It will not affect any existing 0xff
14682 // or 0x0c.
14683 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
14684 SDLoc DL(N);
14685
14686 return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: LHS.getOperand(i: 0),
14687 N2: RHS.getOperand(i: 0),
14688 N3: DAG.getConstant(Val: Sel, DL, VT: MVT::i32));
14689 }
14690 }
14691 }
14692
14693 return SDValue();
14694}
14695
14696// A key component of v_perm is a mapping between byte position of the src
14697// operands, and the byte position of the dest. To provide such, we need: 1. the
14698// node that provides x byte of the dest of the OR, and 2. the byte of the node
14699// used to provide that x byte. calculateByteProvider finds which node provides
14700// a certain byte of the dest of the OR, and calculateSrcByte takes that node,
14701// and finds an ultimate src and byte position For example: The supported
14702// LoadCombine pattern for vector loads is as follows
14703// t1
14704// or
14705// / \
14706// t2 t3
14707// zext shl
14708// | | \
14709// t4 t5 16
14710// or anyext
14711// / \ |
14712// t6 t7 t8
14713// srl shl or
14714// / | / \ / \
14715// t9 t10 t11 t12 t13 t14
14716// trunc* 8 trunc* 8 and and
14717// | | / | | \
14718// t15 t16 t17 t18 t19 t20
14719// trunc* 255 srl -256
14720// | / \
14721// t15 t15 16
14722//
14723// *In this example, the truncs are from i32->i16
14724//
14725// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
14726// respectively. calculateSrcByte would find (given node) -> ultimate src &
14727// byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
14728// After finding the mapping, we can combine the tree into vperm t15, t16,
14729// 0x05000407
14730
14731// Find the source and byte position from a node.
14732// \p DestByte is the byte position of the dest of the or that the src
14733// ultimately provides. \p SrcIndex is the byte of the src that maps to this
14734// dest of the or byte. \p Depth tracks how many recursive iterations we have
14735// performed.
14736static const std::optional<ByteProvider<SDValue>>
14737calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
14738 unsigned Depth = 0) {
14739 // We may need to recursively traverse a series of SRLs
14740 if (Depth >= 6)
14741 return std::nullopt;
14742
14743 if (Op.getValueSizeInBits() < 8)
14744 return std::nullopt;
14745
14746 if (Op.getValueType().isVector())
14747 return ByteProvider<SDValue>::getSrc(Val: Op, ByteOffset: DestByte, VectorOffset: SrcIndex);
14748
14749 switch (Op->getOpcode()) {
14750 case ISD::TRUNCATE: {
14751 return calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte, SrcIndex, Depth: Depth + 1);
14752 }
14753
14754 case ISD::ANY_EXTEND:
14755 case ISD::SIGN_EXTEND:
14756 case ISD::ZERO_EXTEND:
14757 case ISD::SIGN_EXTEND_INREG: {
14758 SDValue NarrowOp = Op->getOperand(Num: 0);
14759 auto NarrowVT = NarrowOp.getValueType();
14760 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
14761 auto *VTSign = cast<VTSDNode>(Val: Op->getOperand(Num: 1));
14762 NarrowVT = VTSign->getVT();
14763 }
14764 if (!NarrowVT.isByteSized())
14765 return std::nullopt;
14766 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
14767
14768 if (SrcIndex >= NarrowByteWidth)
14769 return std::nullopt;
14770 return calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte, SrcIndex, Depth: Depth + 1);
14771 }
14772
14773 case ISD::SRA:
14774 case ISD::SRL: {
14775 auto *ShiftOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
14776 if (!ShiftOp)
14777 return std::nullopt;
14778
14779 uint64_t BitShift = ShiftOp->getZExtValue();
14780
14781 if (BitShift % 8 != 0)
14782 return std::nullopt;
14783
14784 uint64_t NewSrcIndex = SrcIndex + BitShift / 8;
14785 if (NewSrcIndex >= Op.getScalarValueSizeInBits() / 8)
14786 return std::nullopt;
14787
14788 return calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte, SrcIndex: NewSrcIndex,
14789 Depth: Depth + 1);
14790 }
14791
14792 default: {
14793 return ByteProvider<SDValue>::getSrc(Val: Op, ByteOffset: DestByte, VectorOffset: SrcIndex);
14794 }
14795 }
14796 llvm_unreachable("fully handled switch");
14797}
14798
14799// For a byte position in the result of an Or, traverse the tree and find the
14800// node (and the byte of the node) which ultimately provides this {Or,
14801// BytePosition}. \p Op is the operand we are currently examining. \p Index is
14802// the byte position of the Op that corresponds with the originally requested
14803// byte of the Or \p Depth tracks how many recursive iterations we have
14804// performed. \p StartingIndex is the originally requested byte of the Or
14805static const std::optional<ByteProvider<SDValue>>
14806calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
14807 unsigned StartingIndex = 0) {
14808 // Finding Src tree of RHS of or typically requires at least 1 additional
14809 // depth
14810 if (Depth > 6)
14811 return std::nullopt;
14812
14813 unsigned BitWidth = Op.getScalarValueSizeInBits();
14814 if (BitWidth % 8 != 0)
14815 return std::nullopt;
14816 if (Index > BitWidth / 8 - 1)
14817 return std::nullopt;
14818
14819 bool IsVec = Op.getValueType().isVector();
14820 switch (Op.getOpcode()) {
14821 case ISD::OR: {
14822 if (IsVec)
14823 return std::nullopt;
14824
14825 auto RHS = calculateByteProvider(Op: Op.getOperand(i: 1), Index, Depth: Depth + 1,
14826 StartingIndex);
14827 if (!RHS)
14828 return std::nullopt;
14829 auto LHS = calculateByteProvider(Op: Op.getOperand(i: 0), Index, Depth: Depth + 1,
14830 StartingIndex);
14831 if (!LHS)
14832 return std::nullopt;
14833 // A well formed Or will have two ByteProviders for each byte, one of which
14834 // is constant zero
14835 if (!LHS->isConstantZero() && !RHS->isConstantZero())
14836 return std::nullopt;
14837 if (!LHS || LHS->isConstantZero())
14838 return RHS;
14839 if (!RHS || RHS->isConstantZero())
14840 return LHS;
14841 return std::nullopt;
14842 }
14843
14844 case ISD::AND: {
14845 if (IsVec)
14846 return std::nullopt;
14847
14848 auto *BitMaskOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
14849 if (!BitMaskOp)
14850 return std::nullopt;
14851
14852 uint32_t BitMask = BitMaskOp->getZExtValue();
14853 // Bits we expect for our StartingIndex
14854 uint32_t IndexMask = 0xFF << (Index * 8);
14855
14856 if ((IndexMask & BitMask) != IndexMask) {
14857 // If the result of the and partially provides the byte, then it
14858 // is not well formatted
14859 if (IndexMask & BitMask)
14860 return std::nullopt;
14861 return ByteProvider<SDValue>::getConstantZero();
14862 }
14863
14864 return calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte: StartingIndex, SrcIndex: Index);
14865 }
14866
14867 case ISD::FSHR: {
14868 if (IsVec)
14869 return std::nullopt;
14870
14871 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
14872 auto *ShiftOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 2));
14873 if (!ShiftOp || Op.getValueType().isVector())
14874 return std::nullopt;
14875
14876 uint64_t BitsProvided = Op.getValueSizeInBits();
14877 if (BitsProvided % 8 != 0)
14878 return std::nullopt;
14879
14880 uint64_t BitShift = ShiftOp->getAPIntValue().urem(RHS: BitsProvided);
14881 if (BitShift % 8)
14882 return std::nullopt;
14883
14884 uint64_t ConcatSizeInBytes = BitsProvided / 4;
14885 uint64_t ByteShift = BitShift / 8;
14886
14887 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
14888 uint64_t BytesProvided = BitsProvided / 8;
14889 SDValue NextOp = Op.getOperand(i: NewIndex >= BytesProvided ? 0 : 1);
14890 NewIndex %= BytesProvided;
14891 return calculateByteProvider(Op: NextOp, Index: NewIndex, Depth: Depth + 1, StartingIndex);
14892 }
14893
14894 case ISD::SRA:
14895 case ISD::SRL: {
14896 if (IsVec)
14897 return std::nullopt;
14898
14899 auto *ShiftOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
14900 if (!ShiftOp)
14901 return std::nullopt;
14902
14903 uint64_t BitShift = ShiftOp->getZExtValue();
14904 if (BitShift % 8)
14905 return std::nullopt;
14906
14907 auto BitsProvided = Op.getScalarValueSizeInBits();
14908 if (BitsProvided % 8 != 0)
14909 return std::nullopt;
14910
14911 uint64_t BytesProvided = BitsProvided / 8;
14912 uint64_t ByteShift = BitShift / 8;
14913 if (Index + ByteShift < BytesProvided)
14914 return calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte: StartingIndex,
14915 SrcIndex: Index + ByteShift);
14916 // SRA's out-of-range bytes are sign bits, not constant zero.
14917 if (Op.getOpcode() == ISD::SRA)
14918 return std::nullopt;
14919 return ByteProvider<SDValue>::getConstantZero();
14920 }
14921
14922 case ISD::SHL: {
14923 if (IsVec)
14924 return std::nullopt;
14925
14926 auto *ShiftOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
14927 if (!ShiftOp)
14928 return std::nullopt;
14929
14930 uint64_t BitShift = ShiftOp->getZExtValue();
14931 if (BitShift % 8 != 0)
14932 return std::nullopt;
14933 uint64_t ByteShift = BitShift / 8;
14934
14935 // If we are shifting by an amount greater than (or equal to)
14936 // the index we are trying to provide, then it provides 0s. If not,
14937 // then this bytes are not definitively 0s, and the corresponding byte
14938 // of interest is Index - ByteShift of the src
14939 return Index < ByteShift
14940 ? ByteProvider<SDValue>::getConstantZero()
14941 : calculateByteProvider(Op: Op.getOperand(i: 0), Index: Index - ByteShift,
14942 Depth: Depth + 1, StartingIndex);
14943 }
14944 case ISD::ANY_EXTEND:
14945 case ISD::SIGN_EXTEND:
14946 case ISD::ZERO_EXTEND:
14947 case ISD::SIGN_EXTEND_INREG:
14948 case ISD::AssertZext:
14949 case ISD::AssertSext: {
14950 if (IsVec)
14951 return std::nullopt;
14952
14953 SDValue NarrowOp = Op->getOperand(Num: 0);
14954 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
14955 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
14956 Op->getOpcode() == ISD::AssertZext ||
14957 Op->getOpcode() == ISD::AssertSext) {
14958 auto *VTSign = cast<VTSDNode>(Val: Op->getOperand(Num: 1));
14959 NarrowBitWidth = VTSign->getVT().getSizeInBits();
14960 }
14961 if (NarrowBitWidth % 8 != 0)
14962 return std::nullopt;
14963 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
14964
14965 if (Index >= NarrowByteWidth)
14966 return Op.getOpcode() == ISD::ZERO_EXTEND
14967 ? std::optional<ByteProvider<SDValue>>(
14968 ByteProvider<SDValue>::getConstantZero())
14969 : std::nullopt;
14970 return calculateByteProvider(Op: NarrowOp, Index, Depth: Depth + 1, StartingIndex);
14971 }
14972
14973 case ISD::TRUNCATE: {
14974 if (IsVec)
14975 return std::nullopt;
14976
14977 uint64_t NarrowByteWidth = BitWidth / 8;
14978
14979 if (NarrowByteWidth >= Index) {
14980 return calculateByteProvider(Op: Op.getOperand(i: 0), Index, Depth: Depth + 1,
14981 StartingIndex);
14982 }
14983
14984 return std::nullopt;
14985 }
14986
14987 case ISD::CopyFromReg: {
14988 if (BitWidth / 8 > Index)
14989 return calculateSrcByte(Op, DestByte: StartingIndex, SrcIndex: Index);
14990
14991 return std::nullopt;
14992 }
14993
14994 case ISD::LOAD: {
14995 auto *L = cast<LoadSDNode>(Val: Op.getNode());
14996
14997 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
14998 if (NarrowBitWidth % 8 != 0)
14999 return std::nullopt;
15000 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
15001
15002 // If the width of the load does not reach byte we are trying to provide for
15003 // and it is not a ZEXTLOAD, then the load does not provide for the byte in
15004 // question
15005 if (Index >= NarrowByteWidth) {
15006 return L->getExtensionType() == ISD::ZEXTLOAD
15007 ? std::optional<ByteProvider<SDValue>>(
15008 ByteProvider<SDValue>::getConstantZero())
15009 : std::nullopt;
15010 }
15011
15012 if (NarrowByteWidth > Index) {
15013 return calculateSrcByte(Op, DestByte: StartingIndex, SrcIndex: Index);
15014 }
15015
15016 return std::nullopt;
15017 }
15018
15019 case ISD::BSWAP: {
15020 if (IsVec)
15021 return std::nullopt;
15022
15023 return calculateByteProvider(Op: Op->getOperand(Num: 0), Index: BitWidth / 8 - Index - 1,
15024 Depth: Depth + 1, StartingIndex);
15025 }
15026
15027 case ISD::EXTRACT_VECTOR_ELT: {
15028 auto *IdxOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
15029 if (!IdxOp)
15030 return std::nullopt;
15031 auto VecIdx = IdxOp->getZExtValue();
15032 auto ScalarSize = Op.getScalarValueSizeInBits();
15033 if (ScalarSize < 32)
15034 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
15035 return calculateSrcByte(Op: ScalarSize >= 32 ? Op : Op.getOperand(i: 0),
15036 DestByte: StartingIndex, SrcIndex: Index);
15037 }
15038
15039 case AMDGPUISD::PERM: {
15040 if (IsVec)
15041 return std::nullopt;
15042
15043 auto *PermMask = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 2));
15044 if (!PermMask)
15045 return std::nullopt;
15046
15047 auto IdxMask =
15048 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
15049 if (IdxMask > 0x07 && IdxMask != 0x0c)
15050 return std::nullopt;
15051
15052 auto NextOp = Op.getOperand(i: IdxMask > 0x03 ? 0 : 1);
15053 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
15054
15055 return IdxMask != 0x0c ? calculateSrcByte(Op: NextOp, DestByte: StartingIndex, SrcIndex: NextIndex)
15056 : ByteProvider<SDValue>(
15057 ByteProvider<SDValue>::getConstantZero());
15058 }
15059
15060 default: {
15061 return std::nullopt;
15062 }
15063 }
15064
15065 llvm_unreachable("fully handled switch");
15066}
15067
15068// Returns true if the Operand is a scalar and is 16 bits
15069static bool isExtendedFrom16Bits(SDValue &Operand) {
15070
15071 switch (Operand.getOpcode()) {
15072 case ISD::ANY_EXTEND:
15073 case ISD::SIGN_EXTEND:
15074 case ISD::ZERO_EXTEND: {
15075 auto OpVT = Operand.getOperand(i: 0).getValueType();
15076 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
15077 }
15078 case ISD::LOAD: {
15079 LoadSDNode *L = cast<LoadSDNode>(Val: Operand.getNode());
15080 auto ExtType = cast<LoadSDNode>(Val: L)->getExtensionType();
15081 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
15082 ExtType == ISD::EXTLOAD) {
15083 auto MemVT = L->getMemoryVT();
15084 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
15085 }
15086 return L->getMemoryVT().getSizeInBits() == 16;
15087 }
15088 default:
15089 return false;
15090 }
15091}
15092
// Returns true if the two byte selectors packed in the low 16 bits of \p Mask
// name consecutive bytes and the first of them sits at an even byte offset.
// Both selectors are expected to be below 8.
static bool addresses16Bits(int Mask) {
  const int FirstSel = Mask & 0xff;
  const int SecondSel = (Mask & 0xff00) >> 8;

  assert(FirstSel < 8 && SecondSel < 8);

  // The bytes must be contiguous in the order of increasing addresses.
  if (SecondSel - FirstSel != 1)
    return false;

  // The first byte must be at a location aligned for 16 bit instructions.
  // A counter example is taking 2 consecutive bytes starting at the 8th bit.
  // In this case, we still need code to extract the 16 bit operand, so it
  // is better to use i8 v_perm
  return FirstSel % 2 == 0;
}
15110
15111// Do not lower into v_perm if the operands are actually 16 bit
15112// and the selected bits (based on PermMask) correspond with two
15113// easily addressable 16 bit operands.
15114static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
15115 SDValue &OtherOp) {
15116 int Low16 = PermMask & 0xffff;
15117 int Hi16 = (PermMask & 0xffff0000) >> 16;
15118
15119 auto TempOp = peekThroughBitcasts(V: Op);
15120 auto TempOtherOp = peekThroughBitcasts(V: OtherOp);
15121
15122 auto OpIs16Bit =
15123 TempOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(Operand&: TempOp);
15124 if (!OpIs16Bit)
15125 return true;
15126
15127 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
15128 isExtendedFrom16Bits(Operand&: TempOtherOp);
15129 if (!OtherOpIs16Bit)
15130 return true;
15131
15132 // Do we cleanly address both
15133 return !addresses16Bits(Mask: Low16) || !addresses16Bits(Mask: Hi16);
15134}
15135
15136static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
15137 unsigned DWordOffset) {
15138 SDValue Ret;
15139
15140 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
15141 // ByteProvider must be at least 8 bits
15142 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
15143
15144 if (TypeSize <= 32)
15145 return DAG.getBitcastedAnyExtOrTrunc(Op: Src, DL: SL, VT: MVT::i32);
15146
15147 if (Src.getValueType().isVector()) {
15148 auto ScalarTySize = Src.getScalarValueSizeInBits();
15149 auto ScalarTy = Src.getValueType().getScalarType();
15150 if (ScalarTySize == 32) {
15151 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Src,
15152 N2: DAG.getConstant(Val: DWordOffset, DL: SL, VT: MVT::i32));
15153 }
15154 if (ScalarTySize > 32) {
15155 Ret = DAG.getNode(
15156 Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ScalarTy, N1: Src,
15157 N2: DAG.getConstant(Val: DWordOffset / (ScalarTySize / 32), DL: SL, VT: MVT::i32));
15158 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
15159 if (ShiftVal)
15160 Ret = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: Ret.getValueType(), N1: Ret,
15161 N2: DAG.getConstant(Val: ShiftVal, DL: SL, VT: MVT::i32));
15162 return DAG.getBitcastedAnyExtOrTrunc(Op: Ret, DL: SL, VT: MVT::i32);
15163 }
15164
15165 assert(ScalarTySize < 32);
15166 auto NumElements = TypeSize / ScalarTySize;
15167 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
15168 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
15169 auto NumElementsIn32 = 32 / ScalarTySize;
15170 auto NumAvailElements = DWordOffset < Trunc32Elements
15171 ? NumElementsIn32
15172 : NumElements - NormalizedTrunc;
15173
15174 SmallVector<SDValue, 4> VecSrcs;
15175 DAG.ExtractVectorElements(Op: Src, Args&: VecSrcs, Start: DWordOffset * NumElementsIn32,
15176 Count: NumAvailElements);
15177
15178 Ret = DAG.getBuildVector(
15179 VT: MVT::getVectorVT(VT: MVT::getIntegerVT(BitWidth: ScalarTySize), NumElements: NumAvailElements), DL: SL,
15180 Ops: VecSrcs);
15181 return Ret = DAG.getBitcastedAnyExtOrTrunc(Op: Ret, DL: SL, VT: MVT::i32);
15182 }
15183
15184 /// Scalar Type
15185 auto ShiftVal = 32 * DWordOffset;
15186 Ret = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: Src.getValueType(), N1: Src,
15187 N2: DAG.getConstant(Val: ShiftVal, DL: SL, VT: MVT::i32));
15188 return DAG.getBitcastedAnyExtOrTrunc(Op: Ret, DL: SL, VT: MVT::i32);
15189}
15190
15191static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
15192 SelectionDAG &DAG = DCI.DAG;
15193 [[maybe_unused]] EVT VT = N->getValueType(ResNo: 0);
15194 SmallVector<ByteProvider<SDValue>, 8> PermNodes;
15195
15196 // VT is known to be MVT::i32, so we need to provide 4 bytes.
15197 assert(VT == MVT::i32);
15198 for (int i = 0; i < 4; i++) {
15199 // Find the ByteProvider that provides the ith byte of the result of OR
15200 std::optional<ByteProvider<SDValue>> P =
15201 calculateByteProvider(Op: SDValue(N, 0), Index: i, Depth: 0, /*StartingIndex = */ i);
15202 // TODO support constantZero
15203 if (!P || P->isConstantZero())
15204 return SDValue();
15205
15206 PermNodes.push_back(Elt: *P);
15207 }
15208 if (PermNodes.size() != 4)
15209 return SDValue();
15210
15211 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
15212 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
15213 uint64_t PermMask = 0x00000000;
15214 for (size_t i = 0; i < PermNodes.size(); i++) {
15215 auto PermOp = PermNodes[i];
15216 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
15217 // by sizeof(Src2) = 4
15218 int SrcByteAdjust = 4;
15219
15220 // If the Src uses a byte from a different DWORD, then it corresponds
15221 // with a difference source
15222 if (!PermOp.hasSameSrc(Other: PermNodes[FirstSrc.first]) ||
15223 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
15224 if (SecondSrc)
15225 if (!PermOp.hasSameSrc(Other: PermNodes[SecondSrc->first]) ||
15226 ((PermOp.SrcOffset / 4) != SecondSrc->second))
15227 return SDValue();
15228
15229 // Set the index of the second distinct Src node
15230 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
15231 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
15232 SrcByteAdjust = 0;
15233 }
15234 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
15235 assert(!DAG.getDataLayout().isBigEndian());
15236 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
15237 }
15238 SDLoc DL(N);
15239 SDValue Op = *PermNodes[FirstSrc.first].Src;
15240 Op = getDWordFromOffset(DAG, SL: DL, Src: Op, DWordOffset: FirstSrc.second);
15241 assert(Op.getValueSizeInBits() == 32);
15242
15243 // Check that we are not just extracting the bytes in order from an op
15244 if (!SecondSrc) {
15245 int Low16 = PermMask & 0xffff;
15246 int Hi16 = (PermMask & 0xffff0000) >> 16;
15247
15248 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
15249 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
15250
15251 // The perm op would really just produce Op. So combine into Op
15252 if (WellFormedLow && WellFormedHi)
15253 return DAG.getBitcast(VT: MVT::getIntegerVT(BitWidth: 32), V: Op);
15254 }
15255
15256 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
15257
15258 if (SecondSrc) {
15259 OtherOp = getDWordFromOffset(DAG, SL: DL, Src: OtherOp, DWordOffset: SecondSrc->second);
15260 assert(OtherOp.getValueSizeInBits() == 32);
15261 }
15262
15263 // Check that we haven't just recreated the same FSHR node.
15264 if (N->getOpcode() == ISD::FSHR &&
15265 (N->getOperand(Num: 0) == Op || N->getOperand(Num: 0) == OtherOp) &&
15266 (N->getOperand(Num: 1) == Op || N->getOperand(Num: 1) == OtherOp))
15267 return SDValue();
15268
15269 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
15270
15271 assert(Op.getValueType().isByteSized() &&
15272 OtherOp.getValueType().isByteSized());
15273
15274 // If the ultimate src is less than 32 bits, then we will only be
15275 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
15276 // CalculateByteProvider would not have returned Op as source if we
15277 // used a byte that is outside its ValueType. Thus, we are free to
15278 // ANY_EXTEND as the extended bits are dont-cares.
15279 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, VT: MVT::i32);
15280 OtherOp = DAG.getBitcastedAnyExtOrTrunc(Op: OtherOp, DL, VT: MVT::i32);
15281
15282 return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: Op, N2: OtherOp,
15283 N3: DAG.getConstant(Val: PermMask, DL, VT: MVT::i32));
15284 }
15285 return SDValue();
15286}
15287
15288SDValue SITargetLowering::performOrCombine(SDNode *N,
15289 DAGCombinerInfo &DCI) const {
15290 SelectionDAG &DAG = DCI.DAG;
15291 SDValue LHS = N->getOperand(Num: 0);
15292 SDValue RHS = N->getOperand(Num: 1);
15293
15294 EVT VT = N->getValueType(ResNo: 0);
15295 if (VT == MVT::i1) {
15296 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
15297 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
15298 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
15299 SDValue Src = LHS.getOperand(i: 0);
15300 if (Src != RHS.getOperand(i: 0))
15301 return SDValue();
15302
15303 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(Val: LHS.getOperand(i: 1));
15304 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val: RHS.getOperand(i: 1));
15305 if (!CLHS || !CRHS)
15306 return SDValue();
15307
15308 // Only 10 bits are used.
15309 static const uint32_t MaxMask = 0x3ff;
15310
15311 uint32_t NewMask =
15312 (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
15313 SDLoc DL(N);
15314 return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL, VT: MVT::i1, N1: Src,
15315 N2: DAG.getConstant(Val: NewMask, DL, VT: MVT::i32));
15316 }
15317
15318 return SDValue();
15319 }
15320
15321 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
15322 if (isa<ConstantSDNode>(Val: RHS) && LHS.hasOneUse() &&
15323 LHS.getOpcode() == AMDGPUISD::PERM &&
15324 isa<ConstantSDNode>(Val: LHS.getOperand(i: 2))) {
15325 uint32_t Sel = getConstantPermuteMask(C: N->getConstantOperandVal(Num: 1));
15326 if (!Sel)
15327 return SDValue();
15328
15329 Sel |= LHS.getConstantOperandVal(i: 2);
15330 SDLoc DL(N);
15331 return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: LHS.getOperand(i: 0),
15332 N2: LHS.getOperand(i: 1), N3: DAG.getConstant(Val: Sel, DL, VT: MVT::i32));
15333 }
15334
15335 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
15336 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15337 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
15338 N->isDivergent() && TII->pseudoToMCOpcode(Opcode: AMDGPU::V_PERM_B32_e64) != -1) {
15339
15340 // If all the uses of an or need to extract the individual elements, do not
15341 // attempt to lower into v_perm
15342 auto usesCombinedOperand = [](SDNode *OrUse) {
15343 // If we have any non-vectorized use, then it is a candidate for v_perm
15344 if (OrUse->getOpcode() != ISD::BITCAST ||
15345 !OrUse->getValueType(ResNo: 0).isVector())
15346 return true;
15347
15348 // If we have any non-vectorized use, then it is a candidate for v_perm
15349 for (auto *VUser : OrUse->users()) {
15350 if (!VUser->getValueType(ResNo: 0).isVector())
15351 return true;
15352
15353 // If the use of a vector is a store, then combining via a v_perm
15354 // is beneficial.
15355 // TODO -- whitelist more uses
15356 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
15357 if (VUser->getOpcode() == VectorwiseOp)
15358 return true;
15359 }
15360 return false;
15361 };
15362
15363 if (!any_of(Range: N->users(), P: usesCombinedOperand))
15364 return SDValue();
15365
15366 uint32_t LHSMask = getPermuteMask(V: LHS);
15367 uint32_t RHSMask = getPermuteMask(V: RHS);
15368
15369 if (LHSMask != ~0u && RHSMask != ~0u) {
15370 // Canonicalize the expression in an attempt to have fewer unique masks
15371 // and therefore fewer registers used to hold the masks.
15372 if (LHSMask > RHSMask) {
15373 std::swap(a&: LHSMask, b&: RHSMask);
15374 std::swap(a&: LHS, b&: RHS);
15375 }
15376
15377 // Select 0xc for each lane used from source operand. Zero has 0xc mask
15378 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
15379 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
15380 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
15381
15382 // Check of we need to combine values from two sources within a byte.
15383 if (!(LHSUsedLanes & RHSUsedLanes) &&
15384 // If we select high and lower word keep it for SDWA.
15385 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
15386 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
15387 // Kill zero bytes selected by other mask. Zero value is 0xc.
15388 LHSMask &= ~RHSUsedLanes;
15389 RHSMask &= ~LHSUsedLanes;
15390 // Add 4 to each active LHS lane
15391 LHSMask |= LHSUsedLanes & 0x04040404;
15392 // Combine masks
15393 uint32_t Sel = LHSMask | RHSMask;
15394 SDLoc DL(N);
15395
15396 return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: LHS.getOperand(i: 0),
15397 N2: RHS.getOperand(i: 0),
15398 N3: DAG.getConstant(Val: Sel, DL, VT: MVT::i32));
15399 }
15400 }
15401 if (LHSMask == ~0u || RHSMask == ~0u) {
15402 if (SDValue Perm = matchPERM(N, DCI))
15403 return Perm;
15404 }
15405 }
15406
15407 // Detect identity v2i32 OR and replace with identity source node.
15408 // Specifically an Or that has operands constructed from the same source node
15409 // via extract_vector_elt and build_vector. I.E.
15410 // v2i32 or(
15411 // v2i32 build_vector(
15412 // i32 extract_elt(%IdentitySrc, 0),
15413 // i32 0
15414 // ),
15415 // v2i32 build_vector(
15416 // i32 0,
15417 // i32 extract_elt(%IdentitySrc, 1)
15418 // ) )
15419 // =>
15420 // v2i32 %IdentitySrc
15421
15422 if (VT == MVT::v2i32 && LHS->getOpcode() == ISD::BUILD_VECTOR &&
15423 RHS->getOpcode() == ISD::BUILD_VECTOR) {
15424
15425 ConstantSDNode *LC = dyn_cast<ConstantSDNode>(Val: LHS->getOperand(Num: 1));
15426 ConstantSDNode *RC = dyn_cast<ConstantSDNode>(Val: RHS->getOperand(Num: 0));
15427
15428 // Test for and normalise build vectors.
15429 if (LC && RC && LC->getZExtValue() == 0 && RC->getZExtValue() == 0) {
15430
15431 // Get the extract_vector_element operands.
15432 SDValue LEVE = LHS->getOperand(Num: 0);
15433 SDValue REVE = RHS->getOperand(Num: 1);
15434
15435 if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15436 REVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
15437 // Check that different elements from the same vector are
15438 // extracted.
15439 if (LEVE->getOperand(Num: 0) == REVE->getOperand(Num: 0) &&
15440 LEVE->getOperand(Num: 1) != REVE->getOperand(Num: 1)) {
15441 SDValue IdentitySrc = LEVE.getOperand(i: 0);
15442 return IdentitySrc;
15443 }
15444 }
15445 }
15446 }
15447
15448 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
15449 return SDValue();
15450
15451 // TODO: This could be a generic combine with a predicate for extracting the
15452 // high half of an integer being free.
15453
15454 // (or i64:x, (zero_extend i32:y)) ->
15455 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
15456 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
15457 RHS.getOpcode() != ISD::ZERO_EXTEND)
15458 std::swap(a&: LHS, b&: RHS);
15459
15460 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
15461 SDValue ExtSrc = RHS.getOperand(i: 0);
15462 EVT SrcVT = ExtSrc.getValueType();
15463 if (SrcVT == MVT::i32) {
15464 SDLoc SL(N);
15465 auto [LowLHS, HiBits] = split64BitValue(Op: LHS, DAG);
15466 SDValue LowOr = DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: LowLHS, N2: ExtSrc);
15467
15468 DCI.AddToWorklist(N: LowOr.getNode());
15469 DCI.AddToWorklist(N: HiBits.getNode());
15470
15471 SDValue Vec =
15472 DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32, N1: LowOr, N2: HiBits);
15473 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: Vec);
15474 }
15475 }
15476
15477 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
15478 if (CRHS) {
15479 if (SDValue Split = splitBinaryBitConstantOp(DCI, SL: SDLoc(N), Opc: ISD::OR,
15480 LHS: N->getOperand(Num: 0), CRHS))
15481 return Split;
15482 }
15483
15484 return SDValue();
15485}
15486
15487SDValue SITargetLowering::performXorCombine(SDNode *N,
15488 DAGCombinerInfo &DCI) const {
15489 if (SDValue RV = reassociateScalarOps(N, DAG&: DCI.DAG))
15490 return RV;
15491
15492 SDValue LHS = N->getOperand(Num: 0);
15493 SDValue RHS = N->getOperand(Num: 1);
15494
15495 const ConstantSDNode *CRHS = isConstOrConstSplat(N: RHS);
15496 SelectionDAG &DAG = DCI.DAG;
15497
15498 EVT VT = N->getValueType(ResNo: 0);
15499 if (CRHS && VT == MVT::i64) {
15500 if (SDValue Split =
15501 splitBinaryBitConstantOp(DCI, SL: SDLoc(N), Opc: ISD::XOR, LHS, CRHS))
15502 return Split;
15503 }
15504
15505 // v2i32 (xor (vselect cc, x, y), K) ->
15506 // (v2i32 svelect cc, (xor x, K), (xor y, K)) This enables the xor to be
15507 // replaced with source modifiers when the select is lowered to CNDMASK.
15508 unsigned Opc = LHS.getOpcode();
15509 if (((Opc == ISD::VSELECT && VT == MVT::v2i32) ||
15510 (Opc == ISD::SELECT && VT == MVT::i64)) &&
15511 CRHS && CRHS->getAPIntValue().isSignMask()) {
15512 SDValue CC = LHS->getOperand(Num: 0);
15513 SDValue TRUE = LHS->getOperand(Num: 1);
15514 SDValue FALSE = LHS->getOperand(Num: 2);
15515 SDValue XTrue = DAG.getNode(Opcode: ISD::XOR, DL: SDLoc(N), VT, N1: TRUE, N2: RHS);
15516 SDValue XFalse = DAG.getNode(Opcode: ISD::XOR, DL: SDLoc(N), VT, N1: FALSE, N2: RHS);
15517 SDValue XSelect =
15518 DAG.getNode(Opcode: ISD::VSELECT, DL: SDLoc(N), VT, N1: CC, N2: XTrue, N3: XFalse);
15519 return XSelect;
15520 }
15521
15522 // Make sure to apply the 64-bit constant splitting fold before trying to fold
15523 // fneg-like xors into 64-bit select.
15524 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
15525 // This looks like an fneg, try to fold as a source modifier.
15526 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
15527 shouldFoldFNegIntoSrc(FNeg: N, FNegSrc: LHS)) {
15528 // xor (select c, a, b), 0x80000000 ->
15529 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
15530 SDLoc DL(N);
15531 SDValue CastLHS =
15532 DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f32, Operand: LHS->getOperand(Num: 1));
15533 SDValue CastRHS =
15534 DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f32, Operand: LHS->getOperand(Num: 2));
15535 SDValue FNegLHS = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f32, Operand: CastLHS);
15536 SDValue FNegRHS = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f32, Operand: CastRHS);
15537 SDValue NewSelect = DAG.getNode(Opcode: ISD::SELECT, DL, VT: MVT::f32,
15538 N1: LHS->getOperand(Num: 0), N2: FNegLHS, N3: FNegRHS);
15539 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: NewSelect);
15540 }
15541 }
15542
15543 return SDValue();
15544}
15545
15546SDValue
15547SITargetLowering::performZeroOrAnyExtendCombine(SDNode *N,
15548 DAGCombinerInfo &DCI) const {
15549 if (!Subtarget->has16BitInsts() ||
15550 DCI.getDAGCombineLevel() < AfterLegalizeTypes)
15551 return SDValue();
15552
15553 EVT VT = N->getValueType(ResNo: 0);
15554 if (VT != MVT::i32)
15555 return SDValue();
15556
15557 SDValue Src = N->getOperand(Num: 0);
15558 if (Src.getValueType() != MVT::i16)
15559 return SDValue();
15560
15561 if (!Src->hasOneUse())
15562 return SDValue();
15563
15564 // TODO: We bail out below if SrcOffset is not in the first dword (>= 4). It's
15565 // possible we're missing out on some combine opportunities, but we'd need to
15566 // weigh the cost of extracting the byte from the upper dwords.
15567
15568 std::optional<ByteProvider<SDValue>> BP0 =
15569 calculateByteProvider(Op: SDValue(N, 0), Index: 0, Depth: 0, StartingIndex: 0);
15570 if (!BP0 || BP0->SrcOffset >= 4 || !BP0->Src)
15571 return SDValue();
15572 SDValue V0 = *BP0->Src;
15573
15574 std::optional<ByteProvider<SDValue>> BP1 =
15575 calculateByteProvider(Op: SDValue(N, 0), Index: 1, Depth: 0, StartingIndex: 1);
15576 if (!BP1 || BP1->SrcOffset >= 4 || !BP1->Src)
15577 return SDValue();
15578
15579 SDValue V1 = *BP1->Src;
15580
15581 if (V0 == V1)
15582 return SDValue();
15583
15584 SelectionDAG &DAG = DCI.DAG;
15585 SDLoc DL(N);
15586 uint32_t PermMask = 0x0c0c0c0c;
15587 if (V0) {
15588 V0 = DAG.getBitcastedAnyExtOrTrunc(Op: V0, DL, VT: MVT::i32);
15589 PermMask = (PermMask & ~0xFF) | (BP0->SrcOffset + 4);
15590 }
15591
15592 if (V1) {
15593 V1 = DAG.getBitcastedAnyExtOrTrunc(Op: V1, DL, VT: MVT::i32);
15594 PermMask = (PermMask & ~(0xFF << 8)) | (BP1->SrcOffset << 8);
15595 }
15596
15597 return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: V0, N2: V1,
15598 N3: DAG.getConstant(Val: PermMask, DL, VT: MVT::i32));
15599}
15600
15601SDValue
15602SITargetLowering::performSignExtendInRegCombine(SDNode *N,
15603 DAGCombinerInfo &DCI) const {
15604 SDValue Src = N->getOperand(Num: 0);
15605 auto *VTSign = cast<VTSDNode>(Val: N->getOperand(Num: 1));
15606
15607 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
15608 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
15609 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
15610 VTSign->getVT() == MVT::i8) ||
15611 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
15612 VTSign->getVT() == MVT::i16))) {
15613 assert(Subtarget->hasScalarSubwordLoads() &&
15614 "s_buffer_load_{u8, i8} are supported "
15615 "in GFX12 (or newer) architectures.");
15616 EVT VT = Src.getValueType();
15617 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
15618 ? AMDGPUISD::SBUFFER_LOAD_BYTE
15619 : AMDGPUISD::SBUFFER_LOAD_SHORT;
15620 SDLoc DL(N);
15621 SDVTList ResList = DCI.DAG.getVTList(VT: MVT::i32);
15622 SDValue Ops[] = {
15623 Src.getOperand(i: 0), // source register
15624 Src.getOperand(i: 1), // offset
15625 Src.getOperand(i: 2) // cachePolicy
15626 };
15627 auto *M = cast<MemSDNode>(Val&: Src);
15628 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
15629 Opcode: Opc, dl: DL, VTList: ResList, Ops, MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
15630 SDValue LoadVal = DCI.DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: BufferLoad);
15631 return LoadVal;
15632 }
15633 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
15634 VTSign->getVT() == MVT::i8) ||
15635 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
15636 VTSign->getVT() == MVT::i16)) &&
15637 Src.hasOneUse()) {
15638 auto *M = cast<MemSDNode>(Val&: Src);
15639 SDValue Ops[] = {Src.getOperand(i: 0), // Chain
15640 Src.getOperand(i: 1), // rsrc
15641 Src.getOperand(i: 2), // vindex
15642 Src.getOperand(i: 3), // voffset
15643 Src.getOperand(i: 4), // soffset
15644 Src.getOperand(i: 5), // offset
15645 Src.getOperand(i: 6), Src.getOperand(i: 7)};
15646 // replace with BUFFER_LOAD_BYTE/SHORT
15647 SDVTList ResList =
15648 DCI.DAG.getVTList(VT1: MVT::i32, VT2: Src.getOperand(i: 0).getValueType());
15649 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
15650 ? AMDGPUISD::BUFFER_LOAD_BYTE
15651 : AMDGPUISD::BUFFER_LOAD_SHORT;
15652 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
15653 Opcode: Opc, dl: SDLoc(N), VTList: ResList, Ops, MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
15654 return DCI.DAG.getMergeValues(
15655 Ops: {BufferLoadSignExt, BufferLoadSignExt.getValue(R: 1)}, dl: SDLoc(N));
15656 }
15657 return SDValue();
15658}
15659
15660SDValue SITargetLowering::performClassCombine(SDNode *N,
15661 DAGCombinerInfo &DCI) const {
15662 SelectionDAG &DAG = DCI.DAG;
15663 SDValue Mask = N->getOperand(Num: 1);
15664
15665 // fp_class x, 0 -> false
15666 if (isNullConstant(V: Mask))
15667 return DAG.getConstant(Val: 0, DL: SDLoc(N), VT: MVT::i1);
15668
15669 if (N->getOperand(Num: 0).isUndef())
15670 return DAG.getUNDEF(VT: MVT::i1);
15671
15672 return SDValue();
15673}
15674
15675SDValue SITargetLowering::performRcpCombine(SDNode *N,
15676 DAGCombinerInfo &DCI) const {
15677 EVT VT = N->getValueType(ResNo: 0);
15678 SDValue N0 = N->getOperand(Num: 0);
15679
15680 if (N0.isUndef()) {
15681 return DCI.DAG.getConstantFP(Val: APFloat::getQNaN(Sem: VT.getFltSemantics()),
15682 DL: SDLoc(N), VT);
15683 }
15684
15685 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
15686 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
15687 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
15688 return DCI.DAG.getNode(Opcode: AMDGPUISD::RSQ, DL: SDLoc(N), VT, Operand: N0.getOperand(i: 0),
15689 Flags: N->getFlags());
15690 }
15691
15692 return AMDGPUTargetLowering::performRcpCombine(N, DCI);
15693}
15694
15695bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
15696 SDNodeFlags UserFlags,
15697 unsigned MaxDepth) const {
15698 unsigned Opcode = Op.getOpcode();
15699 if (Opcode == ISD::FCANONICALIZE)
15700 return true;
15701
15702 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Val&: Op)) {
15703 const auto &F = CFP->getValueAPF();
15704 if (F.isNaN() && F.isSignaling())
15705 return false;
15706 if (!F.isDenormal())
15707 return true;
15708
15709 DenormalMode Mode =
15710 DAG.getMachineFunction().getDenormalMode(FPType: F.getSemantics());
15711 return Mode == DenormalMode::getIEEE();
15712 }
15713
15714 // If source is a result of another standard FP operation it is already in
15715 // canonical form.
15716 if (MaxDepth == 0)
15717 return false;
15718
15719 switch (Opcode) {
15720 // These will flush denorms if required.
15721 case ISD::FADD:
15722 case ISD::FSUB:
15723 case ISD::FMUL:
15724 case ISD::FCEIL:
15725 case ISD::FFLOOR:
15726 case ISD::FMA:
15727 case ISD::FMAD:
15728 case ISD::FSQRT:
15729 case ISD::FDIV:
15730 case ISD::FREM:
15731 case ISD::FP_ROUND:
15732 case ISD::FP_EXTEND:
15733 case ISD::FP16_TO_FP:
15734 case ISD::FP_TO_FP16:
15735 case ISD::BF16_TO_FP:
15736 case ISD::FP_TO_BF16:
15737 case ISD::FLDEXP:
15738 case AMDGPUISD::FMUL_LEGACY:
15739 case AMDGPUISD::FMAD_FTZ:
15740 case AMDGPUISD::RCP:
15741 case AMDGPUISD::RSQ:
15742 case AMDGPUISD::RSQ_CLAMP:
15743 case AMDGPUISD::RCP_LEGACY:
15744 case AMDGPUISD::RCP_IFLAG:
15745 case AMDGPUISD::LOG:
15746 case AMDGPUISD::EXP:
15747 case AMDGPUISD::DIV_SCALE:
15748 case AMDGPUISD::DIV_FMAS:
15749 case AMDGPUISD::DIV_FIXUP:
15750 case AMDGPUISD::FRACT:
15751 case AMDGPUISD::CVT_PKRTZ_F16_F32:
15752 case AMDGPUISD::CVT_F32_UBYTE0:
15753 case AMDGPUISD::CVT_F32_UBYTE1:
15754 case AMDGPUISD::CVT_F32_UBYTE2:
15755 case AMDGPUISD::CVT_F32_UBYTE3:
15756 case AMDGPUISD::FP_TO_FP16:
15757 case AMDGPUISD::SIN_HW:
15758 case AMDGPUISD::COS_HW:
15759 return true;
15760
15761 // It can/will be lowered or combined as a bit operation.
15762 // Need to check their input recursively to handle.
15763 case ISD::FNEG:
15764 case ISD::FABS:
15765 case ISD::FCOPYSIGN:
15766 return isCanonicalized(DAG, Op: Op.getOperand(i: 0), UserFlags: MaxDepth - 1);
15767
15768 case ISD::AND:
15769 if (Op.getValueType() == MVT::i32) {
15770 // Be careful as we only know it is a bitcast floating point type. It
15771 // could be f32, v2f16, we have no way of knowing. Luckily the constant
15772 // value that we optimize for, which comes up in fp32 to bf16 conversions,
15773 // is valid to optimize for all types.
15774 if (auto *RHS = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 1))) {
15775 if (RHS->getZExtValue() == 0xffff0000) {
15776 return isCanonicalized(DAG, Op: Op.getOperand(i: 0), UserFlags: MaxDepth - 1);
15777 }
15778 }
15779 }
15780 break;
15781
15782 case ISD::FSIN:
15783 case ISD::FCOS:
15784 case ISD::FSINCOS:
15785 return Op.getValueType().getScalarType() != MVT::f16;
15786
15787 case ISD::FMINNUM:
15788 case ISD::FMAXNUM:
15789 case ISD::FMINNUM_IEEE:
15790 case ISD::FMAXNUM_IEEE:
15791 case ISD::FMINIMUM:
15792 case ISD::FMAXIMUM:
15793 case ISD::FMINIMUMNUM:
15794 case ISD::FMAXIMUMNUM:
15795 case AMDGPUISD::CLAMP:
15796 case AMDGPUISD::FMED3:
15797 case AMDGPUISD::FMAX3:
15798 case AMDGPUISD::FMIN3:
15799 case AMDGPUISD::FMAXIMUM3:
15800 case AMDGPUISD::FMINIMUM3: {
15801 // FIXME: Shouldn't treat the generic operations different based these.
15802 // However, we aren't really required to flush the result from
15803 // minnum/maxnum..
15804
15805 // snans will be quieted, so we only need to worry about denormals.
15806 if (Subtarget->supportsMinMaxDenormModes() ||
15807 // FIXME: denormalsEnabledForType is broken for dynamic
15808 denormalsEnabledForType(DAG, VT: Op.getValueType()))
15809 return true;
15810
15811 // Flushing may be required.
15812 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
15813 // targets need to check their input recursively.
15814
15815 // FIXME: Does this apply with clamp? It's implemented with max.
15816 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
15817 if (!isCanonicalized(DAG, Op: Op.getOperand(i: I), UserFlags: MaxDepth - 1))
15818 return false;
15819 }
15820
15821 return true;
15822 }
15823 case ISD::SELECT: {
15824 return isCanonicalized(DAG, Op: Op.getOperand(i: 1), UserFlags: MaxDepth - 1) &&
15825 isCanonicalized(DAG, Op: Op.getOperand(i: 2), UserFlags: MaxDepth - 1);
15826 }
15827 case ISD::BUILD_VECTOR: {
15828 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
15829 SDValue SrcOp = Op.getOperand(i);
15830 if (!isCanonicalized(DAG, Op: SrcOp, UserFlags: MaxDepth - 1))
15831 return false;
15832 }
15833
15834 return true;
15835 }
15836 case ISD::EXTRACT_VECTOR_ELT:
15837 case ISD::EXTRACT_SUBVECTOR: {
15838 return isCanonicalized(DAG, Op: Op.getOperand(i: 0), UserFlags: MaxDepth - 1);
15839 }
15840 case ISD::INSERT_VECTOR_ELT: {
15841 return isCanonicalized(DAG, Op: Op.getOperand(i: 0), UserFlags: MaxDepth - 1) &&
15842 isCanonicalized(DAG, Op: Op.getOperand(i: 1), UserFlags: MaxDepth - 1);
15843 }
15844 case ISD::UNDEF:
15845 // Could be anything.
15846 return false;
15847
15848 case ISD::BITCAST:
15849 // TODO: This is incorrect as it loses track of the operand's type. We may
15850 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
15851 // same bits that are canonicalized in one type need not be in the other.
15852 return isCanonicalized(DAG, Op: Op.getOperand(i: 0), UserFlags: MaxDepth - 1);
15853 case ISD::TRUNCATE: {
15854 // Hack round the mess we make when legalizing extract_vector_elt
15855 if (Op.getValueType() == MVT::i16) {
15856 SDValue TruncSrc = Op.getOperand(i: 0);
15857 if (TruncSrc.getValueType() == MVT::i32 &&
15858 TruncSrc.getOpcode() == ISD::BITCAST &&
15859 TruncSrc.getOperand(i: 0).getValueType() == MVT::v2f16) {
15860 return isCanonicalized(DAG, Op: TruncSrc.getOperand(i: 0), UserFlags: MaxDepth - 1);
15861 }
15862 }
15863 return false;
15864 }
15865 case ISD::INTRINSIC_WO_CHAIN: {
15866 unsigned IntrinsicID = Op.getConstantOperandVal(i: 0);
15867 // TODO: Handle more intrinsics
15868 switch (IntrinsicID) {
15869 case Intrinsic::amdgcn_cvt_pkrtz:
15870 case Intrinsic::amdgcn_cubeid:
15871 case Intrinsic::amdgcn_frexp_mant:
15872 case Intrinsic::amdgcn_fdot2:
15873 case Intrinsic::amdgcn_rcp:
15874 case Intrinsic::amdgcn_rsq:
15875 case Intrinsic::amdgcn_rsq_clamp:
15876 case Intrinsic::amdgcn_rcp_legacy:
15877 case Intrinsic::amdgcn_rsq_legacy:
15878 case Intrinsic::amdgcn_trig_preop:
15879 case Intrinsic::amdgcn_tanh:
15880 case Intrinsic::amdgcn_log:
15881 case Intrinsic::amdgcn_exp2:
15882 case Intrinsic::amdgcn_sqrt:
15883 return true;
15884 default:
15885 break;
15886 }
15887
15888 break;
15889 }
15890 default:
15891 break;
15892 }
15893
15894 // FIXME: denormalsEnabledForType is broken for dynamic
15895 return denormalsEnabledForType(DAG, VT: Op.getValueType()) &&
15896 (UserFlags.hasNoNaNs() || DAG.isKnownNeverSNaN(Op));
15897}
15898
15899bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
15900 unsigned MaxDepth) const {
15901 const MachineRegisterInfo &MRI = MF.getRegInfo();
15902 MachineInstr *MI = MRI.getVRegDef(Reg);
15903 unsigned Opcode = MI->getOpcode();
15904
15905 if (Opcode == AMDGPU::G_FCANONICALIZE)
15906 return true;
15907
15908 std::optional<FPValueAndVReg> FCR;
15909 // Constant splat (can be padded with undef) or scalar constant.
15910 if (mi_match(R: Reg, MRI, P: MIPatternMatch::m_GFCstOrSplat(FPValReg&: FCR))) {
15911 if (FCR->Value.isSignaling())
15912 return false;
15913 if (!FCR->Value.isDenormal())
15914 return true;
15915
15916 DenormalMode Mode = MF.getDenormalMode(FPType: FCR->Value.getSemantics());
15917 return Mode == DenormalMode::getIEEE();
15918 }
15919
15920 if (MaxDepth == 0)
15921 return false;
15922
15923 switch (Opcode) {
15924 case AMDGPU::G_FADD:
15925 case AMDGPU::G_FSUB:
15926 case AMDGPU::G_FMUL:
15927 case AMDGPU::G_FCEIL:
15928 case AMDGPU::G_FFLOOR:
15929 case AMDGPU::G_FRINT:
15930 case AMDGPU::G_FNEARBYINT:
15931 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
15932 case AMDGPU::G_INTRINSIC_TRUNC:
15933 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
15934 case AMDGPU::G_FMA:
15935 case AMDGPU::G_FMAD:
15936 case AMDGPU::G_FSQRT:
15937 case AMDGPU::G_FDIV:
15938 case AMDGPU::G_FREM:
15939 case AMDGPU::G_FPOW:
15940 case AMDGPU::G_FPEXT:
15941 case AMDGPU::G_FLOG:
15942 case AMDGPU::G_FLOG2:
15943 case AMDGPU::G_FLOG10:
15944 case AMDGPU::G_FPTRUNC:
15945 case AMDGPU::G_AMDGPU_RCP_IFLAG:
15946 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
15947 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
15948 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
15949 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
15950 return true;
15951 case AMDGPU::G_FNEG:
15952 case AMDGPU::G_FABS:
15953 case AMDGPU::G_FCOPYSIGN:
15954 return isCanonicalized(Reg: MI->getOperand(i: 1).getReg(), MF, MaxDepth: MaxDepth - 1);
15955 case AMDGPU::G_FMINNUM:
15956 case AMDGPU::G_FMAXNUM:
15957 case AMDGPU::G_FMINNUM_IEEE:
15958 case AMDGPU::G_FMAXNUM_IEEE:
15959 case AMDGPU::G_FMINIMUM:
15960 case AMDGPU::G_FMAXIMUM:
15961 case AMDGPU::G_FMINIMUMNUM:
15962 case AMDGPU::G_FMAXIMUMNUM: {
15963 if (Subtarget->supportsMinMaxDenormModes() ||
15964 // FIXME: denormalsEnabledForType is broken for dynamic
15965 denormalsEnabledForType(Ty: MRI.getType(Reg), MF))
15966 return true;
15967
15968 [[fallthrough]];
15969 }
15970 case AMDGPU::G_BUILD_VECTOR:
15971 for (const MachineOperand &MO : llvm::drop_begin(RangeOrContainer: MI->operands()))
15972 if (!isCanonicalized(Reg: MO.getReg(), MF, MaxDepth: MaxDepth - 1))
15973 return false;
15974 return true;
15975 case AMDGPU::G_INTRINSIC:
15976 case AMDGPU::G_INTRINSIC_CONVERGENT:
15977 switch (cast<GIntrinsic>(Val: MI)->getIntrinsicID()) {
15978 case Intrinsic::amdgcn_fmul_legacy:
15979 case Intrinsic::amdgcn_fmad_ftz:
15980 case Intrinsic::amdgcn_sqrt:
15981 case Intrinsic::amdgcn_fmed3:
15982 case Intrinsic::amdgcn_sin:
15983 case Intrinsic::amdgcn_cos:
15984 case Intrinsic::amdgcn_log:
15985 case Intrinsic::amdgcn_exp2:
15986 case Intrinsic::amdgcn_log_clamp:
15987 case Intrinsic::amdgcn_rcp:
15988 case Intrinsic::amdgcn_rcp_legacy:
15989 case Intrinsic::amdgcn_rsq:
15990 case Intrinsic::amdgcn_rsq_clamp:
15991 case Intrinsic::amdgcn_rsq_legacy:
15992 case Intrinsic::amdgcn_div_scale:
15993 case Intrinsic::amdgcn_div_fmas:
15994 case Intrinsic::amdgcn_div_fixup:
15995 case Intrinsic::amdgcn_fract:
15996 case Intrinsic::amdgcn_cvt_pkrtz:
15997 case Intrinsic::amdgcn_cubeid:
15998 case Intrinsic::amdgcn_cubema:
15999 case Intrinsic::amdgcn_cubesc:
16000 case Intrinsic::amdgcn_cubetc:
16001 case Intrinsic::amdgcn_frexp_mant:
16002 case Intrinsic::amdgcn_fdot2:
16003 case Intrinsic::amdgcn_trig_preop:
16004 case Intrinsic::amdgcn_tanh:
16005 return true;
16006 default:
16007 break;
16008 }
16009
16010 [[fallthrough]];
16011 default:
16012 return false;
16013 }
16014
16015 llvm_unreachable("invalid operation");
16016}
16017
16018// Constant fold canonicalize.
16019SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
16020 const SDLoc &SL, EVT VT,
16021 const APFloat &C) const {
16022 // Flush denormals to 0 if not enabled.
16023 if (C.isDenormal()) {
16024 DenormalMode Mode =
16025 DAG.getMachineFunction().getDenormalMode(FPType: C.getSemantics());
16026 if (Mode == DenormalMode::getPreserveSign()) {
16027 return DAG.getConstantFP(
16028 Val: APFloat::getZero(Sem: C.getSemantics(), Negative: C.isNegative()), DL: SL, VT);
16029 }
16030
16031 if (Mode != DenormalMode::getIEEE())
16032 return SDValue();
16033 }
16034
16035 if (C.isNaN()) {
16036 APFloat CanonicalQNaN = APFloat::getQNaN(Sem: C.getSemantics());
16037 if (C.isSignaling()) {
16038 // Quiet a signaling NaN.
16039 // FIXME: Is this supposed to preserve payload bits?
16040 return DAG.getConstantFP(Val: CanonicalQNaN, DL: SL, VT);
16041 }
16042
16043 // Make sure it is the canonical NaN bitpattern.
16044 //
16045 // TODO: Can we use -1 as the canonical NaN value since it's an inline
16046 // immediate?
16047 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
16048 return DAG.getConstantFP(Val: CanonicalQNaN, DL: SL, VT);
16049 }
16050
16051 // Already canonical.
16052 return DAG.getConstantFP(Val: C, DL: SL, VT);
16053}
16054
16055static bool vectorEltWillFoldAway(SDValue Op) {
16056 return Op.isUndef() || isa<ConstantFPSDNode>(Val: Op);
16057}
16058
16059SDValue
16060SITargetLowering::performFCanonicalizeCombine(SDNode *N,
16061 DAGCombinerInfo &DCI) const {
16062 SelectionDAG &DAG = DCI.DAG;
16063 SDValue N0 = N->getOperand(Num: 0);
16064 EVT VT = N->getValueType(ResNo: 0);
16065
16066 // fcanonicalize undef -> qnan
16067 if (N0.isUndef()) {
16068 APFloat QNaN = APFloat::getQNaN(Sem: VT.getFltSemantics());
16069 return DAG.getConstantFP(Val: QNaN, DL: SDLoc(N), VT);
16070 }
16071
16072 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N: N0)) {
16073 EVT VT = N->getValueType(ResNo: 0);
16074 return getCanonicalConstantFP(DAG, SL: SDLoc(N), VT, C: CFP->getValueAPF());
16075 }
16076
16077 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
16078 // (fcanonicalize k)
16079 //
16080 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
16081
16082 // TODO: This could be better with wider vectors that will be split to v2f16,
16083 // and to consider uses since there aren't that many packed operations.
16084 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
16085 isTypeLegal(VT: MVT::v2f16)) {
16086 SDLoc SL(N);
16087 SDValue NewElts[2];
16088 SDValue Lo = N0.getOperand(i: 0);
16089 SDValue Hi = N0.getOperand(i: 1);
16090 EVT EltVT = Lo.getValueType();
16091
16092 if (vectorEltWillFoldAway(Op: Lo) || vectorEltWillFoldAway(Op: Hi)) {
16093 for (unsigned I = 0; I != 2; ++I) {
16094 SDValue Op = N0.getOperand(i: I);
16095 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Val&: Op)) {
16096 NewElts[I] =
16097 getCanonicalConstantFP(DAG, SL, VT: EltVT, C: CFP->getValueAPF());
16098 } else if (Op.isUndef()) {
16099 // Handled below based on what the other operand is.
16100 NewElts[I] = Op;
16101 } else {
16102 NewElts[I] = DAG.getNode(Opcode: ISD::FCANONICALIZE, DL: SL, VT: EltVT, Operand: Op);
16103 }
16104 }
16105
16106 // If one half is undef, and one is constant, prefer a splat vector rather
16107 // than the normal qNaN. If it's a register, prefer 0.0 since that's
16108 // cheaper to use and may be free with a packed operation.
16109 if (NewElts[0].isUndef()) {
16110 if (isa<ConstantFPSDNode>(Val: NewElts[1]))
16111 NewElts[0] = isa<ConstantFPSDNode>(Val: NewElts[1])
16112 ? NewElts[1]
16113 : DAG.getConstantFP(Val: 0.0f, DL: SL, VT: EltVT);
16114 }
16115
16116 if (NewElts[1].isUndef()) {
16117 NewElts[1] = isa<ConstantFPSDNode>(Val: NewElts[0])
16118 ? NewElts[0]
16119 : DAG.getConstantFP(Val: 0.0f, DL: SL, VT: EltVT);
16120 }
16121
16122 return DAG.getBuildVector(VT, DL: SL, Ops: NewElts);
16123 }
16124 }
16125
16126 return SDValue();
16127}
16128
16129static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
16130 switch (Opc) {
16131 case ISD::FMAXNUM:
16132 case ISD::FMAXNUM_IEEE:
16133 case ISD::FMAXIMUMNUM:
16134 return AMDGPUISD::FMAX3;
16135 case ISD::FMAXIMUM:
16136 return AMDGPUISD::FMAXIMUM3;
16137 case ISD::SMAX:
16138 return AMDGPUISD::SMAX3;
16139 case ISD::UMAX:
16140 return AMDGPUISD::UMAX3;
16141 case ISD::FMINNUM:
16142 case ISD::FMINNUM_IEEE:
16143 case ISD::FMINIMUMNUM:
16144 return AMDGPUISD::FMIN3;
16145 case ISD::FMINIMUM:
16146 return AMDGPUISD::FMINIMUM3;
16147 case ISD::SMIN:
16148 return AMDGPUISD::SMIN3;
16149 case ISD::UMIN:
16150 return AMDGPUISD::UMIN3;
16151 default:
16152 llvm_unreachable("Not a min/max opcode");
16153 }
16154}
16155
16156SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
16157 const SDLoc &SL, SDValue Src,
16158 SDValue MinVal,
16159 SDValue MaxVal,
16160 bool Signed) const {
16161
16162 // med3 comes from
16163 // min(max(x, K0), K1), K0 < K1
16164 // max(min(x, K0), K1), K1 < K0
16165 //
16166 // "MinVal" and "MaxVal" respectively refer to the rhs of the
16167 // min/max op.
16168 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(Val&: MinVal);
16169 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(Val&: MaxVal);
16170
16171 if (!MinK || !MaxK)
16172 return SDValue();
16173
16174 if (Signed) {
16175 if (MaxK->getAPIntValue().sge(RHS: MinK->getAPIntValue()))
16176 return SDValue();
16177 } else {
16178 if (MaxK->getAPIntValue().uge(RHS: MinK->getAPIntValue()))
16179 return SDValue();
16180 }
16181
16182 EVT VT = MinK->getValueType(ResNo: 0);
16183 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
16184 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
16185 return DAG.getNode(Opcode: Med3Opc, DL: SL, VT, N1: Src, N2: MaxVal, N3: MinVal);
16186
16187 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
16188 // not available, but this is unlikely to be profitable as constants
16189 // will often need to be materialized & extended, especially on
16190 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
16191 return SDValue();
16192}
16193
16194static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
16195 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: Op))
16196 return C;
16197
16198 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Val&: Op)) {
16199 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
16200 return C;
16201 }
16202
16203 return nullptr;
16204}
16205
16206SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
16207 const SDLoc &SL, SDValue Op0,
16208 SDValue Op1,
16209 bool IsKnownNoNaNs) const {
16210 ConstantFPSDNode *K1 = getSplatConstantFP(Op: Op1);
16211 if (!K1)
16212 return SDValue();
16213
16214 ConstantFPSDNode *K0 = getSplatConstantFP(Op: Op0.getOperand(i: 1));
16215 if (!K0)
16216 return SDValue();
16217
16218 // Ordered >= (although NaN inputs should have folded away by now).
16219 if (K0->getValueAPF() > K1->getValueAPF())
16220 return SDValue();
16221
16222 // med3 with a nan input acts like
16223 // v_min_f32(v_min_f32(S0.f32, S1.f32), S2.f32)
16224 //
16225 // So the result depends on whether the IEEE mode bit is enabled or not with a
16226 // signaling nan input.
16227 // ieee=1
16228 // s0 snan: yields s2
16229 // s1 snan: yields s2
16230 // s2 snan: qnan
16231
16232 // s0 qnan: min(s1, s2)
16233 // s1 qnan: min(s0, s2)
16234 // s2 qnan: min(s0, s1)
16235
16236 // ieee=0
16237 // s0 snan: min(s1, s2)
16238 // s1 snan: min(s0, s2)
16239 // s2 snan: qnan
16240
16241 // s0 qnan: min(s1, s2)
16242 // s1 qnan: min(s0, s2)
16243 // s2 qnan: min(s0, s1)
16244 const MachineFunction &MF = DAG.getMachineFunction();
16245 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
16246
16247 // TODO: Check IEEE bit enabled. We can form fmed3 with IEEE=0 regardless of
16248 // whether the input is a signaling nan if op0 is fmaximum or fmaximumnum. We
16249 // can only form if op0 is fmaxnum_ieee if IEEE=1.
16250 EVT VT = Op0.getValueType();
16251 if (Info->getMode().DX10Clamp) {
16252 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
16253 // hardware fmed3 behavior converting to a min.
16254 // FIXME: Should this be allowing -0.0?
16255 if (K1->isOne() && K0->isPosZero())
16256 return DAG.getNode(Opcode: AMDGPUISD::CLAMP, DL: SL, VT, Operand: Op0.getOperand(i: 0));
16257 }
16258
16259 // med3 for f16 is only available on gfx9+, and not available for v2f16.
16260 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
16261 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
16262 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
16263 // then give the other result, which is different from med3 with a NaN
16264 // input.
16265 SDValue Var = Op0.getOperand(i: 0);
16266 if (!IsKnownNoNaNs && !DAG.isKnownNeverSNaN(Op: Var))
16267 return SDValue();
16268
16269 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16270
16271 if ((!K0->hasOneUse() || TII->isInlineConstant(Imm: K0->getValueAPF())) &&
16272 (!K1->hasOneUse() || TII->isInlineConstant(Imm: K1->getValueAPF()))) {
16273 return DAG.getNode(Opcode: AMDGPUISD::FMED3, DL: SL, VT: K0->getValueType(ResNo: 0), N1: Var,
16274 N2: SDValue(K0, 0), N3: SDValue(K1, 0));
16275 }
16276 }
16277
16278 return SDValue();
16279}
16280
16281/// \return true if the subtarget supports minimum3 and maximum3 with the given
16282/// base min/max opcode \p Opc for type \p VT.
16283static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
16284 EVT VT) {
16285 switch (Opc) {
16286 case ISD::FMINNUM:
16287 case ISD::FMAXNUM:
16288 case ISD::FMINNUM_IEEE:
16289 case ISD::FMAXNUM_IEEE:
16290 case ISD::FMINIMUMNUM:
16291 case ISD::FMAXIMUMNUM:
16292 case AMDGPUISD::FMIN_LEGACY:
16293 case AMDGPUISD::FMAX_LEGACY:
16294 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
16295 (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
16296 case ISD::FMINIMUM:
16297 case ISD::FMAXIMUM:
16298 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
16299 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
16300 (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
16301 case ISD::SMAX:
16302 case ISD::SMIN:
16303 case ISD::UMAX:
16304 case ISD::UMIN:
16305 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
16306 default:
16307 return false;
16308 }
16309
16310 llvm_unreachable("not a min/max opcode");
16311}
16312
16313SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
16314 DAGCombinerInfo &DCI) const {
16315 SelectionDAG &DAG = DCI.DAG;
16316
16317 EVT VT = N->getValueType(ResNo: 0);
16318 unsigned Opc = N->getOpcode();
16319 SDValue Op0 = N->getOperand(Num: 0);
16320 SDValue Op1 = N->getOperand(Num: 1);
16321
16322 // Only do this if the inner op has one use since this will just increases
16323 // register pressure for no benefit.
16324
16325 if (supportsMin3Max3(Subtarget: *Subtarget, Opc, VT)) {
16326 auto IsTreeWithCombinableChildren = [Opc](SDValue Op) {
16327 return (Op.getOperand(i: 0).getOpcode() == Opc &&
16328 Op.getOperand(i: 0).hasOneUse()) ||
16329 (Op.getOperand(i: 1).getOpcode() == Opc &&
16330 Op.getOperand(i: 1).hasOneUse());
16331 };
16332
16333 bool CanTreeCombineApply = Op0.getOpcode() == Opc && Op0.hasOneUse() &&
16334 Op1.getOpcode() == Opc && Op1.hasOneUse();
16335 bool HasCombinableTreeChild =
16336 CanTreeCombineApply && (IsTreeWithCombinableChildren(Op0) ||
16337 IsTreeWithCombinableChildren(Op1));
16338
16339 // Tree reduction: when both operands are the same min/max op, restructure
16340 // to keep a 2-op node on top so higher tree levels can still combine.
16341 //
16342 // max(max(a, b), max(c, d)) -> max(max3(a, b, c), d)
16343 // min(min(a, b), min(c, d)) -> min(min3(a, b, c), d)
16344 //
16345 // Defer when either inner op is a tree node with combinable children.
16346 if (CanTreeCombineApply && !HasCombinableTreeChild) {
16347 SDLoc DL(N);
16348 SDValue Inner =
16349 DAG.getNode(Opcode: minMaxOpcToMin3Max3Opc(Opc), DL, VT, N1: Op0.getOperand(i: 0),
16350 N2: Op0.getOperand(i: 1), N3: Op1.getOperand(i: 0));
16351 return DAG.getNode(Opcode: Opc, DL, VT, N1: Inner, N2: Op1.getOperand(i: 1));
16352 }
16353
16354 // max(max(a, b), c) -> max3(a, b, c)
16355 // min(min(a, b), c) -> min3(a, b, c)
16356 // Deferred when Op0 is a tree node with combinable children.
16357 if (Op0.getOpcode() == Opc && Op0.hasOneUse() && !HasCombinableTreeChild) {
16358 SDLoc DL(N);
16359 return DAG.getNode(Opcode: minMaxOpcToMin3Max3Opc(Opc), DL, VT: N->getValueType(ResNo: 0),
16360 N1: Op0.getOperand(i: 0), N2: Op0.getOperand(i: 1), N3: Op1);
16361 }
16362
16363 // Try commuted.
16364 // max(a, max(b, c)) -> max3(a, b, c)
16365 // min(a, min(b, c)) -> min3(a, b, c)
16366 // Deferred when Op1 is a tree node with combinable children.
16367 if (Op1.getOpcode() == Opc && Op1.hasOneUse() && !HasCombinableTreeChild) {
16368 SDLoc DL(N);
16369 return DAG.getNode(Opcode: minMaxOpcToMin3Max3Opc(Opc), DL, VT: N->getValueType(ResNo: 0),
16370 N1: Op0, N2: Op1.getOperand(i: 0), N3: Op1.getOperand(i: 1));
16371 }
16372 }
16373
16374 // umin(sffbh(x), bitwidth) -> sffbh(x) if x is known to be not 0 or -1.
16375 SDValue FfbhSrc;
16376 uint64_t Clamp = 0;
16377 if (Opc == ISD::UMIN &&
16378 sd_match(N: Op0,
16379 P: m_IntrinsicWOChain<Intrinsic::amdgcn_sffbh>(Opnds: m_Value(N&: FfbhSrc))) &&
16380 sd_match(N: Op1, P: m_ConstInt(V&: Clamp))) {
16381 unsigned BitWidth = FfbhSrc.getValueType().getScalarSizeInBits();
16382 if (Clamp >= BitWidth) {
16383 KnownBits Known = DAG.computeKnownBits(Op: FfbhSrc);
16384 if (Known.isNonZero() && Known.Zero.getBoolValue())
16385 return Op0;
16386 }
16387 }
16388
16389 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
16390 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
16391 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
16392 if (SDValue Med3 = performIntMed3ImmCombine(
16393 DAG, SL: SDLoc(N), Src: Op0->getOperand(Num: 0), MinVal: Op1, MaxVal: Op0->getOperand(Num: 1), Signed: true))
16394 return Med3;
16395 }
16396 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
16397 if (SDValue Med3 = performIntMed3ImmCombine(
16398 DAG, SL: SDLoc(N), Src: Op0->getOperand(Num: 0), MinVal: Op0->getOperand(Num: 1), MaxVal: Op1, Signed: true))
16399 return Med3;
16400 }
16401
16402 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
16403 if (SDValue Med3 = performIntMed3ImmCombine(
16404 DAG, SL: SDLoc(N), Src: Op0->getOperand(Num: 0), MinVal: Op1, MaxVal: Op0->getOperand(Num: 1), Signed: false))
16405 return Med3;
16406 }
16407 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
16408 if (SDValue Med3 = performIntMed3ImmCombine(
16409 DAG, SL: SDLoc(N), Src: Op0->getOperand(Num: 0), MinVal: Op0->getOperand(Num: 1), MaxVal: Op1, Signed: false))
16410 return Med3;
16411 }
16412
16413 // if !is_snan(x):
16414 // fminnum(fmaxnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
16415 // fminnum_ieee(fmaxnum_ieee(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
16416 // fminnumnum(fmaxnumnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
16417 // fmin_legacy(fmax_legacy(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
16418 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
16419 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
16420 (Opc == ISD::FMINIMUMNUM && Op0.getOpcode() == ISD::FMAXIMUMNUM) ||
16421 (Opc == AMDGPUISD::FMIN_LEGACY &&
16422 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
16423 (VT == MVT::f32 || VT == MVT::f64 ||
16424 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
16425 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
16426 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
16427 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
16428 Op0.hasOneUse()) {
16429 if (SDValue Res = performFPMed3ImmCombine(DAG, SL: SDLoc(N), Op0, Op1,
16430 IsKnownNoNaNs: N->getFlags().hasNoNaNs()))
16431 return Res;
16432 }
16433
16434 // Prefer fminnum_ieee over fminimum. For gfx950, minimum/maximum are legal
16435 // for some types, but at a higher cost since it's implemented with a 3
16436 // operand form.
16437 const SDNodeFlags Flags = N->getFlags();
16438 if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) && Flags.hasNoNaNs() &&
16439 !Subtarget->hasIEEEMinimumMaximumInsts() &&
16440 isOperationLegal(Op: ISD::FMINNUM_IEEE, VT: VT.getScalarType())) {
16441 unsigned NewOpc =
16442 Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
16443 return DAG.getNode(Opcode: NewOpc, DL: SDLoc(N), VT, N1: Op0, N2: Op1, Flags);
16444 }
16445
16446 return SDValue();
16447}
16448
16449static bool isClampZeroToOne(SDValue A, SDValue B) {
16450 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(Val&: A)) {
16451 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(Val&: B)) {
16452 // FIXME: Should this be allowing -0.0?
16453 return (CA->isPosZero() && CB->isOne()) ||
16454 (CA->isOne() && CB->isPosZero());
16455 }
16456 }
16457
16458 return false;
16459}
16460
16461// FIXME: Should only worry about snans for version with chain.
16462SDValue SITargetLowering::performFMed3Combine(SDNode *N,
16463 DAGCombinerInfo &DCI) const {
16464 EVT VT = N->getValueType(ResNo: 0);
16465 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
16466 // NaNs. With a NaN input, the order of the operands may change the result.
16467
16468 SelectionDAG &DAG = DCI.DAG;
16469 SDLoc SL(N);
16470
16471 SDValue Src0 = N->getOperand(Num: 0);
16472 SDValue Src1 = N->getOperand(Num: 1);
16473 SDValue Src2 = N->getOperand(Num: 2);
16474
16475 if (isClampZeroToOne(A: Src0, B: Src1)) {
16476 // const_a, const_b, x -> clamp is safe in all cases including signaling
16477 // nans.
16478 // FIXME: Should this be allowing -0.0?
16479 return DAG.getNode(Opcode: AMDGPUISD::CLAMP, DL: SL, VT, Operand: Src2);
16480 }
16481
16482 const MachineFunction &MF = DAG.getMachineFunction();
16483 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
16484
16485 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
16486 // handling no dx10-clamp?
16487 if (Info->getMode().DX10Clamp) {
16488 // If NaNs is clamped to 0, we are free to reorder the inputs.
16489
16490 if (isa<ConstantFPSDNode>(Val: Src0) && !isa<ConstantFPSDNode>(Val: Src1))
16491 std::swap(a&: Src0, b&: Src1);
16492
16493 if (isa<ConstantFPSDNode>(Val: Src1) && !isa<ConstantFPSDNode>(Val: Src2))
16494 std::swap(a&: Src1, b&: Src2);
16495
16496 if (isa<ConstantFPSDNode>(Val: Src0) && !isa<ConstantFPSDNode>(Val: Src1))
16497 std::swap(a&: Src0, b&: Src1);
16498
16499 if (isClampZeroToOne(A: Src1, B: Src2))
16500 return DAG.getNode(Opcode: AMDGPUISD::CLAMP, DL: SL, VT, Operand: Src0);
16501 }
16502
16503 return SDValue();
16504}
16505
16506SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
16507 DAGCombinerInfo &DCI) const {
16508 SDValue Src0 = N->getOperand(Num: 0);
16509 SDValue Src1 = N->getOperand(Num: 1);
16510 if (Src0.isUndef() && Src1.isUndef())
16511 return DCI.DAG.getUNDEF(VT: N->getValueType(ResNo: 0));
16512 return SDValue();
16513}
16514
16515// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
16516// expanded into a set of cmp/select instructions.
16517bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
16518 unsigned NumElem,
16519 bool IsDivergentIdx,
16520 const GCNSubtarget *Subtarget) {
16521 if (UseDivergentRegisterIndexing)
16522 return false;
16523
16524 unsigned VecSize = EltSize * NumElem;
16525
16526 // Sub-dword vectors of size 2 dword or less have better implementation.
16527 if (VecSize <= 64 && EltSize < 32)
16528 return false;
16529
16530 // Always expand the rest of sub-dword instructions, otherwise it will be
16531 // lowered via memory.
16532 if (EltSize < 32)
16533 return true;
16534
16535 // Always do this if var-idx is divergent, otherwise it will become a loop.
16536 if (IsDivergentIdx)
16537 return true;
16538
16539 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
16540 unsigned NumInsts = NumElem /* Number of compares */ +
16541 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
16542
16543 // On some architectures (GFX9) movrel is not available and it's better
16544 // to expand.
16545 if (Subtarget->useVGPRIndexMode())
16546 return NumInsts <= 16;
16547
16548 // If movrel is available, use it instead of expanding for vector of 8
16549 // elements.
16550 if (Subtarget->hasMovrel())
16551 return NumInsts <= 15;
16552
16553 return true;
16554}
16555
16556bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
16557 SDValue Idx = N->getOperand(Num: N->getNumOperands() - 1);
16558 if (isa<ConstantSDNode>(Val: Idx))
16559 return false;
16560
16561 SDValue Vec = N->getOperand(Num: 0);
16562 EVT VecVT = Vec.getValueType();
16563 EVT EltVT = VecVT.getVectorElementType();
16564 unsigned EltSize = EltVT.getSizeInBits();
16565 unsigned NumElem = VecVT.getVectorNumElements();
16566
16567 return SITargetLowering::shouldExpandVectorDynExt(
16568 EltSize, NumElem, IsDivergentIdx: Idx->isDivergent(), Subtarget: getSubtarget());
16569}
16570
16571SDValue
16572SITargetLowering::performExtractVectorEltCombine(SDNode *N,
16573 DAGCombinerInfo &DCI) const {
16574 SDValue Vec = N->getOperand(Num: 0);
16575 SelectionDAG &DAG = DCI.DAG;
16576
16577 EVT VecVT = Vec.getValueType();
16578 EVT VecEltVT = VecVT.getVectorElementType();
16579 EVT ResVT = N->getValueType(ResNo: 0);
16580
16581 unsigned VecSize = VecVT.getSizeInBits();
16582 unsigned VecEltSize = VecEltVT.getSizeInBits();
16583
16584 if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
16585 allUsesHaveSourceMods(N)) {
16586 SDLoc SL(N);
16587 SDValue Idx = N->getOperand(Num: 1);
16588 SDValue Elt =
16589 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ResVT, N1: Vec.getOperand(i: 0), N2: Idx);
16590 return DAG.getNode(Opcode: Vec.getOpcode(), DL: SL, VT: ResVT, Operand: Elt);
16591 }
16592
16593 // (extract_vector_element (and {y0, y1}, (build_vector 0x1f, 0x1f)), index)
16594 // -> (and (extract_vector_element {y0, y1}, index), 0x1f)
16595 // There are optimisations to transform 64-bit shifts into 32-bit shifts
16596 // depending on the shift operand. See e.g. performSraCombine().
16597 // This combine ensures that the optimisation is compatible with v2i32
16598 // legalised AND.
16599 if (VecVT == MVT::v2i32 && Vec->getOpcode() == ISD::AND &&
16600 Vec->getOperand(Num: 1)->getOpcode() == ISD::BUILD_VECTOR) {
16601
16602 const ConstantSDNode *C = isConstOrConstSplat(N: Vec.getOperand(i: 1));
16603 if (!C || C->getZExtValue() != 0x1f)
16604 return SDValue();
16605
16606 SDLoc SL(N);
16607 SDValue AndMask = DAG.getConstant(Val: 0x1f, DL: SL, VT: MVT::i32);
16608 SDValue EVE = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32,
16609 N1: Vec->getOperand(Num: 0), N2: N->getOperand(Num: 1));
16610 SDValue A = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32, N1: EVE, N2: AndMask);
16611 DAG.ReplaceAllUsesWith(From: N, To: A.getNode());
16612 }
16613
16614 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
16615 // =>
16616 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
16617 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
16618 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
16619 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
16620 SDLoc SL(N);
16621 SDValue Idx = N->getOperand(Num: 1);
16622 unsigned Opc = Vec.getOpcode();
16623
16624 switch (Opc) {
16625 default:
16626 break;
16627 // TODO: Support other binary operations.
16628 case ISD::FADD:
16629 case ISD::FSUB:
16630 case ISD::FMUL:
16631 case ISD::ADD:
16632 case ISD::UMIN:
16633 case ISD::UMAX:
16634 case ISD::SMIN:
16635 case ISD::SMAX:
16636 case ISD::FMAXNUM:
16637 case ISD::FMINNUM:
16638 case ISD::FMAXNUM_IEEE:
16639 case ISD::FMINNUM_IEEE:
16640 case ISD::FMAXIMUM:
16641 case ISD::FMINIMUM: {
16642 SDValue Elt0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ResVT,
16643 N1: Vec.getOperand(i: 0), N2: Idx);
16644 SDValue Elt1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ResVT,
16645 N1: Vec.getOperand(i: 1), N2: Idx);
16646
16647 DCI.AddToWorklist(N: Elt0.getNode());
16648 DCI.AddToWorklist(N: Elt1.getNode());
16649 return DAG.getNode(Opcode: Opc, DL: SL, VT: ResVT, N1: Elt0, N2: Elt1, Flags: Vec->getFlags());
16650 }
16651 }
16652 }
16653
16654 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
16655 if (shouldExpandVectorDynExt(N)) {
16656 SDLoc SL(N);
16657 SDValue Idx = N->getOperand(Num: 1);
16658 SDValue V;
16659 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
16660 SDValue IC = DAG.getVectorIdxConstant(Val: I, DL: SL);
16661 SDValue Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ResVT, N1: Vec, N2: IC);
16662 if (I == 0)
16663 V = Elt;
16664 else
16665 V = DAG.getSelectCC(DL: SL, LHS: Idx, RHS: IC, True: Elt, False: V, Cond: ISD::SETEQ);
16666 }
16667 return V;
16668 }
16669
16670 // EXTRACT_VECTOR_ELT (v2i32 bitcast (i64/f64:k), Idx)
16671 // =>
16672 // i32:Lo(k) if Idx == 0, or
16673 // i32:Hi(k) if Idx == 1
16674 auto *Idx = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
16675 if (Vec.getOpcode() == ISD::BITCAST && VecVT == MVT::v2i32 && Idx) {
16676 SDLoc SL(N);
16677 SDValue PeekThrough = Vec.getOperand(i: 0);
16678 auto *KImm = dyn_cast<ConstantSDNode>(Val&: PeekThrough);
16679 if (KImm && KImm->getValueType(ResNo: 0).getSizeInBits() == 64) {
16680 uint64_t KImmValue = KImm->getZExtValue();
16681 return DAG.getConstant(
16682 Val: (KImmValue >> (32 * Idx->getZExtValue())) & 0xffffffff, DL: SL, VT: MVT::i32);
16683 }
16684 auto *KFPImm = dyn_cast<ConstantFPSDNode>(Val&: PeekThrough);
16685 if (KFPImm && KFPImm->getValueType(ResNo: 0).getSizeInBits() == 64) {
16686 uint64_t KFPImmValue =
16687 KFPImm->getValueAPF().bitcastToAPInt().getZExtValue();
16688 return DAG.getConstant(Val: (KFPImmValue >> (32 * Idx->getZExtValue())) &
16689 0xffffffff,
16690 DL: SL, VT: MVT::i32);
16691 }
16692 }
16693
16694 if (!DCI.isBeforeLegalize())
16695 return SDValue();
16696
16697 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
16698 // elements. This exposes more load reduction opportunities by replacing
16699 // multiple small extract_vector_elements with a single 32-bit extract.
16700 if (isa<MemSDNode>(Val: Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
16701 VecSize > 32 && VecSize % 32 == 0 && Idx) {
16702 EVT NewVT = getEquivalentMemType(Context&: *DAG.getContext(), VT: VecVT);
16703
16704 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
16705 unsigned EltIdx = BitIndex / 32;
16706 unsigned LeftoverBitIdx = BitIndex % 32;
16707 SDLoc SL(N);
16708
16709 SDValue Cast = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVT, Operand: Vec);
16710 DCI.AddToWorklist(N: Cast.getNode());
16711
16712 SDValue Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Cast,
16713 N2: DAG.getConstant(Val: EltIdx, DL: SL, VT: MVT::i32));
16714 DCI.AddToWorklist(N: Elt.getNode());
16715 SDValue Srl = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: MVT::i32, N1: Elt,
16716 N2: DAG.getConstant(Val: LeftoverBitIdx, DL: SL, VT: MVT::i32));
16717 DCI.AddToWorklist(N: Srl.getNode());
16718
16719 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
16720 SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: VecEltAsIntVT, Operand: Srl);
16721 DCI.AddToWorklist(N: Trunc.getNode());
16722
16723 if (VecEltVT == ResVT) {
16724 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: VecEltVT, Operand: Trunc);
16725 }
16726
16727 assert(ResVT.isScalarInteger());
16728 return DAG.getAnyExtOrTrunc(Op: Trunc, DL: SL, VT: ResVT);
16729 }
16730
16731 return SDValue();
16732}
16733
16734SDValue
16735SITargetLowering::performInsertVectorEltCombine(SDNode *N,
16736 DAGCombinerInfo &DCI) const {
16737 SDValue Vec = N->getOperand(Num: 0);
16738 SDValue Idx = N->getOperand(Num: 2);
16739 EVT VecVT = Vec.getValueType();
16740 EVT EltVT = VecVT.getVectorElementType();
16741
16742 // INSERT_VECTOR_ELT (<n x e>, var-idx)
16743 // => BUILD_VECTOR n x select (e, const-idx)
16744 if (!shouldExpandVectorDynExt(N))
16745 return SDValue();
16746
16747 SelectionDAG &DAG = DCI.DAG;
16748 SDLoc SL(N);
16749 SDValue Ins = N->getOperand(Num: 1);
16750 EVT IdxVT = Idx.getValueType();
16751
16752 SmallVector<SDValue, 16> Ops;
16753 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
16754 SDValue IC = DAG.getConstant(Val: I, DL: SL, VT: IdxVT);
16755 SDValue Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, N1: Vec, N2: IC);
16756 SDValue V = DAG.getSelectCC(DL: SL, LHS: Idx, RHS: IC, True: Ins, False: Elt, Cond: ISD::SETEQ);
16757 Ops.push_back(Elt: V);
16758 }
16759
16760 return DAG.getBuildVector(VT: VecVT, DL: SL, Ops);
16761}
16762
16763/// Return the source of an fp_extend from f16 to f32, or a converted FP
16764/// constant.
16765static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
16766 if (Src.getOpcode() == ISD::FP_EXTEND &&
16767 Src.getOperand(i: 0).getValueType() == MVT::f16) {
16768 return Src.getOperand(i: 0);
16769 }
16770
16771 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Val&: Src)) {
16772 APFloat Val = CFP->getValueAPF();
16773 bool LosesInfo = true;
16774 Val.convert(ToSemantics: APFloat::IEEEhalf(), RM: APFloat::rmNearestTiesToEven, losesInfo: &LosesInfo);
16775 if (!LosesInfo)
16776 return DAG.getConstantFP(Val, DL: SDLoc(Src), VT: MVT::f16);
16777 }
16778
16779 return SDValue();
16780}
16781
16782SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
16783 DAGCombinerInfo &DCI) const {
16784 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
16785 "combine only useful on gfx8");
16786
16787 SDValue TruncSrc = N->getOperand(Num: 0);
16788 EVT VT = N->getValueType(ResNo: 0);
16789 if (VT != MVT::f16)
16790 return SDValue();
16791
16792 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
16793 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
16794 return SDValue();
16795
16796 SelectionDAG &DAG = DCI.DAG;
16797 SDLoc SL(N);
16798
16799 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
16800 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
16801 // casting back.
16802
16803 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
16804 // fmin(fmax(a, b), fmax(fmin(a, b), c))
16805 SDValue A = strictFPExtFromF16(DAG, Src: TruncSrc.getOperand(i: 0));
16806 if (!A)
16807 return SDValue();
16808
16809 SDValue B = strictFPExtFromF16(DAG, Src: TruncSrc.getOperand(i: 1));
16810 if (!B)
16811 return SDValue();
16812
16813 SDValue C = strictFPExtFromF16(DAG, Src: TruncSrc.getOperand(i: 2));
16814 if (!C)
16815 return SDValue();
16816
16817 // This changes signaling nan behavior. If an input is a signaling nan, it
16818 // would have been quieted by the fpext originally. We don't care because
16819 // these are unconstrained ops. If we needed to insert quieting canonicalizes
16820 // we would be worse off than just doing the promotion.
16821 SDValue A1 = DAG.getNode(Opcode: ISD::FMINNUM_IEEE, DL: SL, VT, N1: A, N2: B);
16822 SDValue B1 = DAG.getNode(Opcode: ISD::FMAXNUM_IEEE, DL: SL, VT, N1: A, N2: B);
16823 SDValue C1 = DAG.getNode(Opcode: ISD::FMAXNUM_IEEE, DL: SL, VT, N1: A1, N2: C);
16824 return DAG.getNode(Opcode: ISD::FMINNUM_IEEE, DL: SL, VT, N1: B1, N2: C1);
16825}
16826
16827unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
16828 const SDNode *N0,
16829 const SDNode *N1) const {
16830 EVT VT = N0->getValueType(ResNo: 0);
16831
16832 // Only do this if we are not trying to support denormals. v_mad_f32 does not
16833 // support denormals ever.
16834 if (((VT == MVT::f32 &&
16835 denormalModeIsFlushAllF32(MF: DAG.getMachineFunction())) ||
16836 (VT == MVT::f16 && Subtarget->hasMadF16() &&
16837 denormalModeIsFlushAllF64F16(MF: DAG.getMachineFunction()))) &&
16838 isOperationLegal(Op: ISD::FMAD, VT))
16839 return ISD::FMAD;
16840
16841 const TargetOptions &Options = DAG.getTarget().Options;
16842 if ((Options.AllowFPOpFusion == FPOpFusion::Fast ||
16843 (N0->getFlags().hasAllowContract() &&
16844 N1->getFlags().hasAllowContract())) &&
16845 isFMAFasterThanFMulAndFAdd(MF: DAG.getMachineFunction(), VT)) {
16846 return ISD::FMA;
16847 }
16848
16849 return 0;
16850}
16851
16852// For a reassociatable opcode perform:
16853// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
16854SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
16855 SelectionDAG &DAG) const {
16856 EVT VT = N->getValueType(ResNo: 0);
16857 if (VT != MVT::i32 && VT != MVT::i64)
16858 return SDValue();
16859
16860 if (DAG.isBaseWithConstantOffset(Op: SDValue(N, 0)))
16861 return SDValue();
16862
16863 unsigned Opc = N->getOpcode();
16864 SDValue Op0 = N->getOperand(Num: 0);
16865 SDValue Op1 = N->getOperand(Num: 1);
16866
16867 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
16868 return SDValue();
16869
16870 if (Op0->isDivergent())
16871 std::swap(a&: Op0, b&: Op1);
16872
16873 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
16874 return SDValue();
16875
16876 SDValue Op2 = Op1.getOperand(i: 1);
16877 Op1 = Op1.getOperand(i: 0);
16878 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
16879 return SDValue();
16880
16881 if (Op1->isDivergent())
16882 std::swap(a&: Op1, b&: Op2);
16883
16884 SDLoc SL(N);
16885 SDValue Add1 = DAG.getNode(Opcode: Opc, DL: SL, VT, N1: Op0, N2: Op1);
16886 return DAG.getNode(Opcode: Opc, DL: SL, VT, N1: Add1, N2: Op2);
16887}
16888
16889static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
16890 SDValue N0, SDValue N1, SDValue N2, bool Signed) {
16891 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
16892 SDVTList VTs = DAG.getVTList(VT1: MVT::i64, VT2: MVT::i1);
16893 SDValue Mad = DAG.getNode(Opcode: MadOpc, DL: SL, VTList: VTs, N1: N0, N2: N1, N3: N2);
16894 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: Mad);
16895}
16896
16897// Fold
16898// y = lshr i64 x, 32
16899// res = add (mul i64 y, Const), x where "Const" is a 64-bit constant
16900// with Const.hi == -1
16901// To
16902// res = mad_u64_u32 y.lo ,Const.lo, x.lo
16903static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
16904 SDValue MulLHS, SDValue MulRHS,
16905 SDValue AddRHS) {
16906 if (MulRHS.getOpcode() == ISD::SRL)
16907 std::swap(a&: MulLHS, b&: MulRHS);
16908
16909 if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL)
16910 return SDValue();
16911
16912 ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(Val: MulLHS.getOperand(i: 1));
16913 if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 ||
16914 MulLHS.getOperand(i: 0) != AddRHS)
16915 return SDValue();
16916
16917 ConstantSDNode *Const = dyn_cast<ConstantSDNode>(Val: MulRHS.getNode());
16918 if (!Const || Hi_32(Value: Const->getZExtValue()) != uint32_t(-1))
16919 return SDValue();
16920
16921 SDValue ConstMul =
16922 DAG.getConstant(Val: Lo_32(Value: Const->getZExtValue()), DL: SL, VT: MVT::i32);
16923 return getMad64_32(DAG, SL, VT: MVT::i64,
16924 N0: DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: MulLHS), N1: ConstMul,
16925 N2: DAG.getZeroExtendInReg(Op: AddRHS, DL: SL, VT: MVT::i32), Signed: false);
16926}
16927
16928// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
16929// multiplies, if any.
16930//
16931// Full 64-bit multiplies that feed into an addition are lowered here instead
16932// of using the generic expansion. The generic expansion ends up with
16933// a tree of ADD nodes that prevents us from using the "add" part of the
16934// MAD instruction. The expansion produced here results in a chain of ADDs
16935// instead of a tree.
16936SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
16937 DAGCombinerInfo &DCI) const {
16938 assert(N->isAnyAdd());
16939
16940 SelectionDAG &DAG = DCI.DAG;
16941 EVT VT = N->getValueType(ResNo: 0);
16942 SDLoc SL(N);
16943 SDValue LHS = N->getOperand(Num: 0);
16944 SDValue RHS = N->getOperand(Num: 1);
16945
16946 if (VT.isVector())
16947 return SDValue();
16948
16949 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
16950 // result in scalar registers for uniform values.
16951 if (!N->isDivergent() && Subtarget->hasSMulHi())
16952 return SDValue();
16953
16954 unsigned NumBits = VT.getScalarSizeInBits();
16955 if (NumBits <= 32 || NumBits > 64)
16956 return SDValue();
16957
16958 if (LHS.getOpcode() != ISD::MUL) {
16959 assert(RHS.getOpcode() == ISD::MUL);
16960 std::swap(a&: LHS, b&: RHS);
16961 }
16962
16963 // Avoid the fold if it would unduly increase the number of multiplies due to
16964 // multiple uses, except on hardware with full-rate multiply-add (which is
16965 // part of full-rate 64-bit ops).
16966 if (!Subtarget->hasFullRate64Ops()) {
16967 unsigned NumUsers = 0;
16968 for (SDNode *User : LHS->users()) {
16969 // There is a use that does not feed into addition, so the multiply can't
16970 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
16971 if (!User->isAnyAdd())
16972 return SDValue();
16973
16974 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
16975 // MUL + 3xADD + 3xADDC over 3xMAD.
16976 ++NumUsers;
16977 if (NumUsers >= 3)
16978 return SDValue();
16979 }
16980 }
16981
16982 SDValue MulLHS = LHS.getOperand(i: 0);
16983 SDValue MulRHS = LHS.getOperand(i: 1);
16984 SDValue AddRHS = RHS;
16985
16986 if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
16987 return FoldedMAD;
16988
16989 // Always check whether operands are small unsigned values, since that
16990 // knowledge is useful in more cases. Check for small signed values only if
16991 // doing so can unlock a shorter code sequence.
16992 bool MulLHSUnsigned32 = numBitsUnsigned(Op: MulLHS, DAG) <= 32;
16993 bool MulRHSUnsigned32 = numBitsUnsigned(Op: MulRHS, DAG) <= 32;
16994
16995 bool MulSignedLo = false;
16996 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
16997 MulSignedLo =
16998 numBitsSigned(Op: MulLHS, DAG) <= 32 && numBitsSigned(Op: MulRHS, DAG) <= 32;
16999 }
17000
17001 // The operands and final result all have the same number of bits. If
17002 // operands need to be extended, they can be extended with garbage. The
17003 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
17004 // truncated away in the end.
17005 if (VT != MVT::i64) {
17006 MulLHS = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i64, Operand: MulLHS);
17007 MulRHS = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i64, Operand: MulRHS);
17008 AddRHS = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i64, Operand: AddRHS);
17009 }
17010
17011 // The basic code generated is conceptually straightforward. Pseudo code:
17012 //
17013 // accum = mad_64_32 lhs.lo, rhs.lo, accum
17014 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
17015 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
17016 //
17017 // The second and third lines are optional, depending on whether the factors
17018 // are {sign,zero}-extended or not.
17019 //
17020 // The actual DAG is noisier than the pseudo code, but only due to
17021 // instructions that disassemble values into low and high parts, and
17022 // assemble the final result.
17023 SDValue One = DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32);
17024
17025 auto MulLHSLo = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: MulLHS);
17026 auto MulRHSLo = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: MulRHS);
17027 SDValue Accum =
17028 getMad64_32(DAG, SL, VT: MVT::i64, N0: MulLHSLo, N1: MulRHSLo, N2: AddRHS, Signed: MulSignedLo);
17029
17030 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
17031 auto [AccumLo, AccumHi] = DAG.SplitScalar(N: Accum, DL: SL, LoVT: MVT::i32, HiVT: MVT::i32);
17032
17033 if (!MulLHSUnsigned32) {
17034 auto MulLHSHi =
17035 DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL: SL, VT: MVT::i32, N1: MulLHS, N2: One);
17036 SDValue MulHi = DAG.getNode(Opcode: ISD::MUL, DL: SL, VT: MVT::i32, N1: MulLHSHi, N2: MulRHSLo);
17037 AccumHi = DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32, N1: MulHi, N2: AccumHi);
17038 }
17039
17040 if (!MulRHSUnsigned32) {
17041 auto MulRHSHi =
17042 DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL: SL, VT: MVT::i32, N1: MulRHS, N2: One);
17043 SDValue MulHi = DAG.getNode(Opcode: ISD::MUL, DL: SL, VT: MVT::i32, N1: MulLHSLo, N2: MulRHSHi);
17044 AccumHi = DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32, N1: MulHi, N2: AccumHi);
17045 }
17046
17047 Accum = DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {AccumLo, AccumHi});
17048 Accum = DAG.getBitcast(VT: MVT::i64, V: Accum);
17049 }
17050
17051 if (VT != MVT::i64)
17052 Accum = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: Accum);
17053 return Accum;
17054}
17055
17056SDValue
17057SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
17058 DAGCombinerInfo &DCI) const {
17059 SDValue RHS = N->getOperand(Num: 1);
17060 auto *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
17061 if (!CRHS)
17062 return SDValue();
17063
17064 // TODO: Worth using computeKnownBits? Maybe expensive since it's so
17065 // common.
17066 uint64_t Val = CRHS->getZExtValue();
17067 if (countr_zero(Val) >= 32) {
17068 SelectionDAG &DAG = DCI.DAG;
17069 SDLoc SL(N);
17070 SDValue LHS = N->getOperand(Num: 0);
17071
17072 // Avoid carry machinery if we know the low half of the add does not
17073 // contribute to the final result.
17074 //
17075 // add i64:x, K if computeTrailingZeros(K) >= 32
17076 // => build_pair (add x.hi, K.hi), x.lo
17077
17078 // Breaking the 64-bit add here with this strange constant is unlikely
17079 // to interfere with addressing mode patterns.
17080
17081 SDValue Hi = getHiHalf64(Op: LHS, DAG);
17082 SDValue ConstHi32 = DAG.getConstant(Val: Hi_32(Value: Val), DL: SL, VT: MVT::i32);
17083 unsigned Opcode = N->getOpcode();
17084 if (Opcode == ISD::PTRADD)
17085 Opcode = ISD::ADD;
17086 SDValue AddHi =
17087 DAG.getNode(Opcode, DL: SL, VT: MVT::i32, N1: Hi, N2: ConstHi32, Flags: N->getFlags());
17088
17089 SDValue Lo = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: LHS);
17090 return DAG.getNode(Opcode: ISD::BUILD_PAIR, DL: SL, VT: MVT::i64, N1: Lo, N2: AddHi);
17091 }
17092
17093 return SDValue();
17094}
17095
17096// Collect the ultimate src of each of the mul node's operands, and confirm
17097// each operand is 8 bytes.
17098static std::optional<ByteProvider<SDValue>>
17099handleMulOperand(const SDValue &MulOperand) {
17100 auto Byte0 = calculateByteProvider(Op: MulOperand, Index: 0, Depth: 0);
17101 if (!Byte0 || Byte0->isConstantZero()) {
17102 return std::nullopt;
17103 }
17104 auto Byte1 = calculateByteProvider(Op: MulOperand, Index: 1, Depth: 0);
17105 if (Byte1 && !Byte1->isConstantZero()) {
17106 return std::nullopt;
17107 }
17108 return Byte0;
17109}
17110
/// Merge two byte-select masks lane by lane.
///
/// In every byte lane the non-0x0c bits of both masks are OR-ed together,
/// and the 0x0c bits survive only where both masks have them. Merging a lane
/// holding 0x0c with a lane holding a selector therefore yields the selector.
///
/// Asserts that in each lane at least one of the two masks has some of the
/// 0x0c bits set.
static unsigned addPermMasks(unsigned First, unsigned Second) {
  constexpr unsigned CBits = 0x0c0c0c0c;

  for (unsigned Shift = 0; Shift != 32; Shift += 8) {
    assert((((First | Second) & CBits) >> Shift) & 0xFF);
  }

  const unsigned Selectors = (First & ~CBits) | (Second & ~CBits);
  const unsigned SharedCBits = First & Second & CBits;
  return Selectors | SharedCBits;
}
17124
// One source dword feeding a dot4 operand. Entries are created by
// placeSources() and consumed by resolveSources(). The member order is relied
// upon by positional initialization and structured bindings in this file.
struct DotSrc {
  // Value that provides the bytes.
  SDValue SrcOp;
  // Byte-select mask, passed as the mask operand of AMDGPUISD::PERM. One
  // byte per lane; lanes that carry no selector hold 0x0c.
  int64_t PermMask;
  // Which 32-bit word of SrcOp is used (the byte offset divided by 4).
  int64_t DWordOffset;
};
17130
17131static void placeSources(ByteProvider<SDValue> &Src0,
17132 ByteProvider<SDValue> &Src1,
17133 SmallVectorImpl<DotSrc> &Src0s,
17134 SmallVectorImpl<DotSrc> &Src1s, int Step) {
17135
17136 assert(Src0.Src.has_value() && Src1.Src.has_value());
17137 // Src0s and Src1s are empty, just place arbitrarily.
17138 if (Step == 0) {
17139 Src0s.push_back(Elt: {.SrcOp: *Src0.Src, .PermMask: ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
17140 .DWordOffset: Src0.SrcOffset / 4});
17141 Src1s.push_back(Elt: {.SrcOp: *Src1.Src, .PermMask: ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
17142 .DWordOffset: Src1.SrcOffset / 4});
17143 return;
17144 }
17145
17146 for (int BPI = 0; BPI < 2; BPI++) {
17147 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
17148 if (BPI == 1) {
17149 BPP = {Src1, Src0};
17150 }
17151 unsigned ZeroMask = 0x0c0c0c0c;
17152 unsigned FMask = 0xFF << (8 * (3 - Step));
17153
17154 unsigned FirstMask =
17155 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
17156 unsigned SecondMask =
17157 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
17158 // Attempt to find Src vector which contains our SDValue, if so, add our
17159 // perm mask to the existing one. If we are unable to find a match for the
17160 // first SDValue, attempt to find match for the second.
17161 int FirstGroup = -1;
17162 for (int I = 0; I < 2; I++) {
17163 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
17164 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
17165 return IterElt.SrcOp == *BPP.first.Src &&
17166 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
17167 };
17168
17169 auto *Match = llvm::find_if(Range&: Srcs, P: MatchesFirst);
17170 if (Match != Srcs.end()) {
17171 Match->PermMask = addPermMasks(First: FirstMask, Second: Match->PermMask);
17172 FirstGroup = I;
17173 break;
17174 }
17175 }
17176 if (FirstGroup != -1) {
17177 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
17178 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
17179 return IterElt.SrcOp == *BPP.second.Src &&
17180 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
17181 };
17182 auto *Match = llvm::find_if(Range&: Srcs, P: MatchesSecond);
17183 if (Match != Srcs.end()) {
17184 Match->PermMask = addPermMasks(First: SecondMask, Second: Match->PermMask);
17185 } else
17186 Srcs.push_back(Elt: {.SrcOp: *BPP.second.Src, .PermMask: SecondMask, .DWordOffset: BPP.second.SrcOffset / 4});
17187 return;
17188 }
17189 }
17190
17191 // If we have made it here, then we could not find a match in Src0s or Src1s
17192 // for either Src0 or Src1, so just place them arbitrarily.
17193
17194 unsigned ZeroMask = 0x0c0c0c0c;
17195 unsigned FMask = 0xFF << (8 * (3 - Step));
17196
17197 Src0s.push_back(
17198 Elt: {.SrcOp: *Src0.Src,
17199 .PermMask: ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
17200 .DWordOffset: Src0.SrcOffset / 4});
17201 Src1s.push_back(
17202 Elt: {.SrcOp: *Src1.Src,
17203 .PermMask: ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
17204 .DWordOffset: Src1.SrcOffset / 4});
17205}
17206
17207static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
17208 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
17209 bool IsAny) {
17210
17211 // If we just have one source, just permute it accordingly.
17212 if (Srcs.size() == 1) {
17213 auto *Elt = Srcs.begin();
17214 auto EltOp = getDWordFromOffset(DAG, SL, Src: Elt->SrcOp, DWordOffset: Elt->DWordOffset);
17215
17216 // v_perm will produce the original value
17217 if (Elt->PermMask == 0x3020100)
17218 return EltOp;
17219
17220 return DAG.getNode(Opcode: AMDGPUISD::PERM, DL: SL, VT: MVT::i32, N1: EltOp, N2: EltOp,
17221 N3: DAG.getConstant(Val: Elt->PermMask, DL: SL, VT: MVT::i32));
17222 }
17223
17224 auto *FirstElt = Srcs.begin();
17225 auto *SecondElt = std::next(x: FirstElt);
17226
17227 SmallVector<SDValue, 2> Perms;
17228
17229 // If we have multiple sources in the chain, combine them via perms (using
17230 // calculated perm mask) and Ors.
17231 while (true) {
17232 auto FirstMask = FirstElt->PermMask;
17233 auto SecondMask = SecondElt->PermMask;
17234
17235 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
17236 unsigned FirstPlusFour = FirstMask | 0x04040404;
17237 // 0x0c + 0x04 = 0x10, so anding with 0x0F will produced 0x00 for any
17238 // original 0x0C.
17239 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
17240
17241 auto PermMask = addPermMasks(First: FirstMask, Second: SecondMask);
17242 auto FirstVal =
17243 getDWordFromOffset(DAG, SL, Src: FirstElt->SrcOp, DWordOffset: FirstElt->DWordOffset);
17244 auto SecondVal =
17245 getDWordFromOffset(DAG, SL, Src: SecondElt->SrcOp, DWordOffset: SecondElt->DWordOffset);
17246
17247 Perms.push_back(Elt: DAG.getNode(Opcode: AMDGPUISD::PERM, DL: SL, VT: MVT::i32, N1: FirstVal,
17248 N2: SecondVal,
17249 N3: DAG.getConstant(Val: PermMask, DL: SL, VT: MVT::i32)));
17250
17251 FirstElt = std::next(x: SecondElt);
17252 if (FirstElt == Srcs.end())
17253 break;
17254
17255 SecondElt = std::next(x: FirstElt);
17256 // If we only have a FirstElt, then just combine that into the cumulative
17257 // source node.
17258 if (SecondElt == Srcs.end()) {
17259 auto EltOp =
17260 getDWordFromOffset(DAG, SL, Src: FirstElt->SrcOp, DWordOffset: FirstElt->DWordOffset);
17261
17262 Perms.push_back(
17263 Elt: DAG.getNode(Opcode: AMDGPUISD::PERM, DL: SL, VT: MVT::i32, N1: EltOp, N2: EltOp,
17264 N3: DAG.getConstant(Val: FirstElt->PermMask, DL: SL, VT: MVT::i32)));
17265 break;
17266 }
17267 }
17268
17269 assert(Perms.size() == 1 || Perms.size() == 2);
17270 return Perms.size() == 2
17271 ? DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: Perms[0], N2: Perms[1])
17272 : Perms[0];
17273}
17274
17275static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
17276 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
17277 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
17278 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
17279 EntryMask += ZeroMask;
17280 }
17281}
17282
17283static bool isMul(const SDValue Op) {
17284 auto Opcode = Op.getOpcode();
17285
17286 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
17287 Opcode == AMDGPUISD::MUL_I24);
17288}
17289
17290static std::optional<bool>
17291checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
17292 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
17293 const SDValue &S1Op, const SelectionDAG &DAG) {
17294 // If we both ops are i8s (pre legalize-dag), then the signedness semantics
17295 // of the dot4 is irrelevant.
17296 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
17297 return false;
17298
17299 auto Known0 = DAG.computeKnownBits(Op: S0Op, Depth: 0);
17300 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
17301 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
17302 auto Known1 = DAG.computeKnownBits(Op: S1Op, Depth: 0);
17303 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
17304 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
17305
17306 assert(!(S0IsUnsigned && S0IsSigned));
17307 assert(!(S1IsUnsigned && S1IsSigned));
17308
17309 // There are 9 possible permutations of
17310 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
17311
17312 // In two permutations, the sign bits are known to be the same for both Ops,
17313 // so simply return Signed / Unsigned corresponding to the MSB
17314
17315 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
17316 return S0IsSigned;
17317
17318 // In another two permutations, the sign bits are known to be opposite. In
17319 // this case return std::nullopt to indicate a bad match.
17320
17321 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
17322 return std::nullopt;
17323
17324 // In the remaining five permutations, we don't know the value of the sign
17325 // bit for at least one Op. Since we have a valid ByteProvider, we know that
17326 // the upper bits must be extension bits. Thus, the only ways for the sign
17327 // bit to be unknown is if it was sign extended from unknown value, or if it
17328 // was any extended. In either case, it is correct to use the signed
17329 // version of the signedness semantics of dot4
17330
17331 // In two of such permutations, we known the sign bit is set for
17332 // one op, and the other is unknown. It is okay to used signed version of
17333 // dot4.
17334 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
17335 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
17336 return true;
17337
17338 // In one such permutation, we don't know either of the sign bits. It is okay
17339 // to used the signed version of dot4.
17340 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
17341 return true;
17342
17343 // In two of such permutations, we known the sign bit is unset for
17344 // one op, and the other is unknown. Return std::nullopt to indicate a
17345 // bad match.
17346 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
17347 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
17348 return std::nullopt;
17349
17350 llvm_unreachable("Fully covered condition");
17351}
17352
17353SDValue SITargetLowering::performAddCombine(SDNode *N,
17354 DAGCombinerInfo &DCI) const {
17355 SelectionDAG &DAG = DCI.DAG;
17356 EVT VT = N->getValueType(ResNo: 0);
17357 SDLoc SL(N);
17358 SDValue LHS = N->getOperand(Num: 0);
17359 SDValue RHS = N->getOperand(Num: 1);
17360
17361 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
17362 if (Subtarget->hasMad64_32()) {
17363 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
17364 return Folded;
17365 }
17366 }
17367
17368 if (SDValue V = reassociateScalarOps(N, DAG)) {
17369 return V;
17370 }
17371
17372 if (VT == MVT::i64) {
17373 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
17374 return Folded;
17375 }
17376
17377 if ((isMul(Op: LHS) || isMul(Op: RHS)) && Subtarget->hasDot7Insts() &&
17378 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
17379 SDValue TempNode(N, 0);
17380 std::optional<bool> IsSigned;
17381 SmallVector<DotSrc, 4> Src0s;
17382 SmallVector<DotSrc, 4> Src1s;
17383 SmallVector<SDValue, 4> Src2s;
17384
17385 // Match the v_dot4 tree, while collecting src nodes.
17386 int ChainLength = 0;
17387 for (int I = 0; I < 4; I++) {
17388 auto MulIdx = isMul(Op: LHS) ? 0 : isMul(Op: RHS) ? 1 : -1;
17389 if (MulIdx == -1)
17390 break;
17391 auto Src0 = handleMulOperand(MulOperand: TempNode->getOperand(Num: MulIdx)->getOperand(Num: 0));
17392 if (!Src0)
17393 break;
17394 auto Src1 = handleMulOperand(MulOperand: TempNode->getOperand(Num: MulIdx)->getOperand(Num: 1));
17395 if (!Src1)
17396 break;
17397
17398 auto IterIsSigned = checkDot4MulSignedness(
17399 N: TempNode->getOperand(Num: MulIdx), Src0&: *Src0, Src1&: *Src1,
17400 S0Op: TempNode->getOperand(Num: MulIdx)->getOperand(Num: 0),
17401 S1Op: TempNode->getOperand(Num: MulIdx)->getOperand(Num: 1), DAG);
17402 if (!IterIsSigned)
17403 break;
17404 if (!IsSigned)
17405 IsSigned = *IterIsSigned;
17406 if (*IterIsSigned != *IsSigned)
17407 break;
17408 placeSources(Src0&: *Src0, Src1&: *Src1, Src0s, Src1s, Step: I);
17409 auto AddIdx = 1 - MulIdx;
17410 // Allow the special case where add (add (mul24, 0), mul24) became ->
17411 // add (mul24, mul24).
17412 if (I == 2 && isMul(Op: TempNode->getOperand(Num: AddIdx))) {
17413 Src2s.push_back(Elt: TempNode->getOperand(Num: AddIdx));
17414 auto Src0 =
17415 handleMulOperand(MulOperand: TempNode->getOperand(Num: AddIdx)->getOperand(Num: 0));
17416 if (!Src0)
17417 break;
17418 auto Src1 =
17419 handleMulOperand(MulOperand: TempNode->getOperand(Num: AddIdx)->getOperand(Num: 1));
17420 if (!Src1)
17421 break;
17422 auto IterIsSigned = checkDot4MulSignedness(
17423 N: TempNode->getOperand(Num: AddIdx), Src0&: *Src0, Src1&: *Src1,
17424 S0Op: TempNode->getOperand(Num: AddIdx)->getOperand(Num: 0),
17425 S1Op: TempNode->getOperand(Num: AddIdx)->getOperand(Num: 1), DAG);
17426 if (!IterIsSigned)
17427 break;
17428 assert(IsSigned);
17429 if (*IterIsSigned != *IsSigned)
17430 break;
17431 placeSources(Src0&: *Src0, Src1&: *Src1, Src0s, Src1s, Step: I + 1);
17432 Src2s.push_back(Elt: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32));
17433 ChainLength = I + 2;
17434 break;
17435 }
17436
17437 TempNode = TempNode->getOperand(Num: AddIdx);
17438 Src2s.push_back(Elt: TempNode);
17439 ChainLength = I + 1;
17440 if (TempNode->getNumOperands() < 2)
17441 break;
17442 LHS = TempNode->getOperand(Num: 0);
17443 RHS = TempNode->getOperand(Num: 1);
17444 }
17445
17446 if (ChainLength < 2)
17447 return SDValue();
17448
17449 // Masks were constructed with assumption that we would find a chain of
17450 // length 4. If not, then we need to 0 out the MSB bits (via perm mask of
17451 // 0x0c) so they do not affect dot calculation.
17452 if (ChainLength < 4) {
17453 fixMasks(Srcs&: Src0s, ChainLength);
17454 fixMasks(Srcs&: Src1s, ChainLength);
17455 }
17456
17457 SDValue Src0, Src1;
17458
17459 // If we are just using a single source for both, and have permuted the
17460 // bytes consistently, we can just use the sources without permuting
17461 // (commutation).
17462 bool UseOriginalSrc = false;
17463 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
17464 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
17465 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
17466 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
17467 SmallVector<unsigned, 4> SrcBytes;
17468 auto Src0Mask = Src0s.begin()->PermMask;
17469 SrcBytes.push_back(Elt: Src0Mask & 0xFF000000);
17470 bool UniqueEntries = true;
17471 for (auto I = 1; I < 4; I++) {
17472 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
17473
17474 if (is_contained(Range&: SrcBytes, Element: NextByte)) {
17475 UniqueEntries = false;
17476 break;
17477 }
17478 SrcBytes.push_back(Elt: NextByte);
17479 }
17480
17481 if (UniqueEntries) {
17482 UseOriginalSrc = true;
17483
17484 auto *FirstElt = Src0s.begin();
17485 auto FirstEltOp =
17486 getDWordFromOffset(DAG, SL, Src: FirstElt->SrcOp, DWordOffset: FirstElt->DWordOffset);
17487
17488 auto *SecondElt = Src1s.begin();
17489 auto SecondEltOp = getDWordFromOffset(DAG, SL, Src: SecondElt->SrcOp,
17490 DWordOffset: SecondElt->DWordOffset);
17491
17492 Src0 = DAG.getBitcastedAnyExtOrTrunc(Op: FirstEltOp, DL: SL,
17493 VT: MVT::getIntegerVT(BitWidth: 32));
17494 Src1 = DAG.getBitcastedAnyExtOrTrunc(Op: SecondEltOp, DL: SL,
17495 VT: MVT::getIntegerVT(BitWidth: 32));
17496 }
17497 }
17498
17499 if (!UseOriginalSrc) {
17500 Src0 = resolveSources(DAG, SL, Srcs&: Src0s, IsSigned: false, IsAny: true);
17501 Src1 = resolveSources(DAG, SL, Srcs&: Src1s, IsSigned: false, IsAny: true);
17502 }
17503
17504 assert(IsSigned);
17505 SDValue Src2 =
17506 DAG.getExtOrTrunc(IsSigned: *IsSigned, Op: Src2s[ChainLength - 1], DL: SL, VT: MVT::i32);
17507
17508 SDValue IID = DAG.getTargetConstant(Val: *IsSigned ? Intrinsic::amdgcn_sdot4
17509 : Intrinsic::amdgcn_udot4,
17510 DL: SL, VT: MVT::i64);
17511
17512 assert(!VT.isVector());
17513 auto Dot = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32, N1: IID, N2: Src0,
17514 N3: Src1, N4: Src2, N5: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i1));
17515
17516 return DAG.getExtOrTrunc(IsSigned: *IsSigned, Op: Dot, DL: SL, VT);
17517 }
17518
17519 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
17520 return SDValue();
17521
17522 // add x, zext (setcc) => uaddo_carry x, 0, setcc
17523 // add x, sext (setcc) => usubo_carry x, 0, setcc
17524 unsigned Opc = LHS.getOpcode();
17525 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
17526 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
17527 std::swap(a&: RHS, b&: LHS);
17528
17529 Opc = RHS.getOpcode();
17530 switch (Opc) {
17531 default:
17532 break;
17533 case ISD::ZERO_EXTEND:
17534 case ISD::SIGN_EXTEND:
17535 case ISD::ANY_EXTEND: {
17536 auto Cond = RHS.getOperand(i: 0);
17537 // If this won't be a real VOPC output, we would still need to insert an
17538 // extra instruction anyway.
17539 if (!isBoolSGPR(V: Cond))
17540 break;
17541 SDVTList VTList = DAG.getVTList(VT1: MVT::i32, VT2: MVT::i1);
17542 SDValue Args[] = {LHS, DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32), Cond};
17543 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
17544 return DAG.getNode(Opcode: Opc, DL: SL, VTList, Ops: Args);
17545 }
17546 case ISD::UADDO_CARRY: {
17547 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
17548 if (!isNullConstant(V: RHS.getOperand(i: 1)))
17549 break;
17550 SDValue Args[] = {LHS, RHS.getOperand(i: 0), RHS.getOperand(i: 2)};
17551 return DAG.getNode(Opcode: ISD::UADDO_CARRY, DL: SDLoc(N), VTList: RHS->getVTList(), Ops: Args);
17552 }
17553 }
17554 return SDValue();
17555}
17556
17557SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
17558 DAGCombinerInfo &DCI) const {
17559 SelectionDAG &DAG = DCI.DAG;
17560 SDLoc DL(N);
17561 EVT VT = N->getValueType(ResNo: 0);
17562 SDValue N0 = N->getOperand(Num: 0);
17563 SDValue N1 = N->getOperand(Num: 1);
17564
17565 // The following folds transform PTRADDs into regular arithmetic in cases
17566 // where the PTRADD wouldn't be folded as an immediate offset into memory
17567 // instructions anyway. They are target-specific in that other targets might
17568 // prefer to not lose information about the pointer arithmetic.
17569
17570 // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)).
17571 // Adapted from DAGCombiner::visitADDLikeCommutative.
17572 SDValue V, K;
17573 if (sd_match(N: N1, P: m_Shl(L: m_Neg(V: m_Value(N&: V)), R: m_Value(N&: K)))) {
17574 SDNodeFlags ShlFlags = N1->getFlags();
17575 // If the original shl is NUW and NSW, the first k+1 bits of 0-v are all 0,
17576 // so v is either 0 or the first k+1 bits of v are all 1 -> NSW can be
17577 // preserved.
17578 SDNodeFlags NewShlFlags =
17579 ShlFlags.hasNoUnsignedWrap() && ShlFlags.hasNoSignedWrap()
17580 ? SDNodeFlags::NoSignedWrap
17581 : SDNodeFlags();
17582 SDValue Inner = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: V, N2: K, Flags: NewShlFlags);
17583 DCI.AddToWorklist(N: Inner.getNode());
17584 return DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: N0, N2: Inner);
17585 }
17586
17587 // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in
17588 // performAddCombine.
17589 if (N1.getOpcode() == ISD::MUL) {
17590 if (Subtarget->hasMad64_32()) {
17591 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
17592 return Folded;
17593 }
17594 }
17595
17596 // If the 32 low bits of the constant are all zero, there is nothing to fold
17597 // into an immediate offset, so it's better to eliminate the unnecessary
17598 // addition for the lower 32 bits than to preserve the PTRADD.
17599 // Analogous to a fold in performAddCombine.
17600 if (VT == MVT::i64) {
17601 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
17602 return Folded;
17603 }
17604
17605 if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
17606 return SDValue();
17607
17608 SDValue X = N0;
17609 SDValue Y = N1.getOperand(i: 0);
17610 SDValue Z = N1.getOperand(i: 1);
17611 bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(N: Y);
17612 bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(N: Z);
17613
17614 if (!YIsConstant && !ZIsConstant && !X->isDivergent() &&
17615 Y->isDivergent() != Z->isDivergent()) {
17616 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
17617 // y are uniform and z isn't.
17618 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
17619 // z are uniform and y isn't.
17620 // The goal is to push uniform operands up in the computation, so that they
17621 // can be handled with scalar operations. We can't use reassociateScalarOps
17622 // for this since it requires two identical commutative operations to
17623 // reassociate.
17624 if (Y->isDivergent())
17625 std::swap(a&: Y, b&: Z);
17626 // If both additions in the original were NUW, reassociation preserves that.
17627 SDNodeFlags ReassocFlags =
17628 (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
17629 SDValue UniformInner = DAG.getMemBasePlusOffset(Base: X, Offset: Y, DL, Flags: ReassocFlags);
17630 DCI.AddToWorklist(N: UniformInner.getNode());
17631 return DAG.getMemBasePlusOffset(Base: UniformInner, Offset: Z, DL, Flags: ReassocFlags);
17632 }
17633
17634 return SDValue();
17635}
17636
17637static bool isCtlzOpc(unsigned Opc) {
17638 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_POISON;
17639}
17640
17641SDValue SITargetLowering::performSubCombine(SDNode *N,
17642 DAGCombinerInfo &DCI) const {
17643 SelectionDAG &DAG = DCI.DAG;
17644 EVT VT = N->getValueType(ResNo: 0);
17645
17646 if (VT == MVT::i64) {
17647 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
17648 return Folded;
17649 }
17650
17651 if (VT != MVT::i32)
17652 return SDValue();
17653
17654 SDLoc SL(N);
17655 SDValue LHS = N->getOperand(Num: 0);
17656 SDValue RHS = N->getOperand(Num: 1);
17657
17658 // sub x, zext (setcc) => usubo_carry x, 0, setcc
17659 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
17660 unsigned Opc = RHS.getOpcode();
17661 switch (Opc) {
17662 default:
17663 break;
17664 case ISD::ZERO_EXTEND:
17665 case ISD::SIGN_EXTEND:
17666 case ISD::ANY_EXTEND: {
17667 auto Cond = RHS.getOperand(i: 0);
17668 // If this won't be a real VOPC output, we would still need to insert an
17669 // extra instruction anyway.
17670 if (!isBoolSGPR(V: Cond))
17671 break;
17672 SDVTList VTList = DAG.getVTList(VT1: MVT::i32, VT2: MVT::i1);
17673 SDValue Args[] = {LHS, DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32), Cond};
17674 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
17675 return DAG.getNode(Opcode: Opc, DL: SL, VTList, Ops: Args);
17676 }
17677 }
17678
17679 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
17680 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
17681 if (!isNullConstant(V: LHS.getOperand(i: 1)))
17682 return SDValue();
17683 SDValue Args[] = {LHS.getOperand(i: 0), RHS, LHS.getOperand(i: 2)};
17684 return DAG.getNode(Opcode: ISD::USUBO_CARRY, DL: SDLoc(N), VTList: LHS->getVTList(), Ops: Args);
17685 }
17686
17687 // sub (ctlz (xor x, (sra x, 31))), 1 -> ctls x.
17688 if (isOneConstant(V: RHS) && isCtlzOpc(Opc: LHS.getOpcode())) {
17689 SDValue CtlzSrc = LHS.getOperand(i: 0);
17690 // Check for xor x, (sra x, 31) pattern.
17691 if (CtlzSrc.getOpcode() == ISD::XOR) {
17692 SDValue X = CtlzSrc.getOperand(i: 0);
17693 SDValue SignExt = CtlzSrc.getOperand(i: 1);
17694 // Try both ordering of XOR operands.
17695 if (SignExt.getOpcode() != ISD::SRA)
17696 std::swap(a&: X, b&: SignExt);
17697 if (SignExt.getOpcode() == ISD::SRA && SignExt.getOperand(i: 0) == X) {
17698 ConstantSDNode *ShiftAmt =
17699 dyn_cast<ConstantSDNode>(Val: SignExt.getOperand(i: 1));
17700 unsigned BitWidth = X.getValueType().getScalarSizeInBits();
17701 if (ShiftAmt && ShiftAmt->getZExtValue() == BitWidth - 1)
17702 return DAG.getNode(Opcode: ISD::CTLS, DL: SL, VT, Operand: X);
17703 }
17704 }
17705 }
17706
17707 return SDValue();
17708}
17709
17710SDValue SITargetLowering::performFAddCombine(SDNode *N,
17711 DAGCombinerInfo &DCI) const {
17712 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
17713 return SDValue();
17714
17715 SelectionDAG &DAG = DCI.DAG;
17716 EVT VT = N->getValueType(ResNo: 0);
17717
17718 SDLoc SL(N);
17719 SDValue LHS = N->getOperand(Num: 0);
17720 SDValue RHS = N->getOperand(Num: 1);
17721
17722 // These should really be instruction patterns, but writing patterns with
17723 // source modifiers is a pain.
17724
17725 // fadd (fadd (a, a), b) -> mad 2.0, a, b
17726 if (LHS.getOpcode() == ISD::FADD) {
17727 SDValue A = LHS.getOperand(i: 0);
17728 if (A == LHS.getOperand(i: 1)) {
17729 unsigned FusedOp = getFusedOpcode(DAG, N0: N, N1: LHS.getNode());
17730 if (FusedOp != 0) {
17731 const SDValue Two = DAG.getConstantFP(Val: 2.0, DL: SL, VT);
17732 return DAG.getNode(Opcode: FusedOp, DL: SL, VT, N1: A, N2: Two, N3: RHS);
17733 }
17734 }
17735 }
17736
17737 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
17738 if (RHS.getOpcode() == ISD::FADD) {
17739 SDValue A = RHS.getOperand(i: 0);
17740 if (A == RHS.getOperand(i: 1)) {
17741 unsigned FusedOp = getFusedOpcode(DAG, N0: N, N1: RHS.getNode());
17742 if (FusedOp != 0) {
17743 const SDValue Two = DAG.getConstantFP(Val: 2.0, DL: SL, VT);
17744 return DAG.getNode(Opcode: FusedOp, DL: SL, VT, N1: A, N2: Two, N3: LHS);
17745 }
17746 }
17747 }
17748
17749 return SDValue();
17750}
17751
17752SDValue SITargetLowering::performFSubCombine(SDNode *N,
17753 DAGCombinerInfo &DCI) const {
17754 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
17755 return SDValue();
17756
17757 SelectionDAG &DAG = DCI.DAG;
17758 SDLoc SL(N);
17759 EVT VT = N->getValueType(ResNo: 0);
17760 assert(!VT.isVector());
17761
17762 // Try to get the fneg to fold into the source modifier. This undoes generic
17763 // DAG combines and folds them into the mad.
17764 //
17765 // Only do this if we are not trying to support denormals. v_mad_f32 does
17766 // not support denormals ever.
17767 SDValue LHS = N->getOperand(Num: 0);
17768 SDValue RHS = N->getOperand(Num: 1);
17769 if (LHS.getOpcode() == ISD::FADD) {
17770 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
17771 SDValue A = LHS.getOperand(i: 0);
17772 if (A == LHS.getOperand(i: 1)) {
17773 unsigned FusedOp = getFusedOpcode(DAG, N0: N, N1: LHS.getNode());
17774 if (FusedOp != 0) {
17775 const SDValue Two = DAG.getConstantFP(Val: 2.0, DL: SL, VT);
17776 SDValue NegRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
17777
17778 return DAG.getNode(Opcode: FusedOp, DL: SL, VT, N1: A, N2: Two, N3: NegRHS);
17779 }
17780 }
17781 }
17782
17783 if (RHS.getOpcode() == ISD::FADD) {
17784 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
17785
17786 SDValue A = RHS.getOperand(i: 0);
17787 if (A == RHS.getOperand(i: 1)) {
17788 unsigned FusedOp = getFusedOpcode(DAG, N0: N, N1: RHS.getNode());
17789 if (FusedOp != 0) {
17790 const SDValue NegTwo = DAG.getConstantFP(Val: -2.0, DL: SL, VT);
17791 return DAG.getNode(Opcode: FusedOp, DL: SL, VT, N1: A, N2: NegTwo, N3: LHS);
17792 }
17793 }
17794 }
17795
17796 return SDValue();
17797}
17798
17799SDValue SITargetLowering::performFDivCombine(SDNode *N,
17800 DAGCombinerInfo &DCI) const {
17801 SelectionDAG &DAG = DCI.DAG;
17802 SDLoc SL(N);
17803 EVT VT = N->getValueType(ResNo: 0);
17804
17805 if (VT != MVT::f16 && VT != MVT::bf16)
17806 return SDValue();
17807
17808 SDValue LHS = N->getOperand(Num: 0);
17809 SDValue RHS = N->getOperand(Num: 1);
17810
17811 SDNodeFlags Flags = N->getFlags();
17812 SDNodeFlags RHSFlags = RHS->getFlags();
17813 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
17814 !RHS->hasOneUse())
17815 return SDValue();
17816
17817 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(Val&: LHS)) {
17818 bool IsNegative = false;
17819 if (CLHS->isOne() || (IsNegative = CLHS->isMinusOne())) {
17820 // fdiv contract 1.0, (sqrt contract x) -> rsq
17821 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq)
17822 if (RHS.getOpcode() == ISD::FSQRT) {
17823 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
17824 SDValue SqrtOp = RHS.getOperand(i: 0);
17825 SDValue Rsq;
17826 if (isOperationLegal(Op: ISD::FSQRT, VT)) {
17827 // fsqrt legality correlates to rsq availability of the same type.
17828 Rsq = DAG.getNode(Opcode: AMDGPUISD::RSQ, DL: SL, VT, Operand: SqrtOp, Flags);
17829 } else if (VT == MVT::f16) {
17830 // Targets without 16-bit instructions (gfx6/gfx7) have no f16 rsq,
17831 // but v_rsq_f32 is more than accurate enough for f16. Unlike bf16,
17832 // every f16 value (including denormals) extends to a normal f32, and
17833 // an f16 rsq result is never denormal, so the f32 reciprocal square
17834 // root needs no denormal handling. Compute it in f32 and round back.
17835 SDValue Ext =
17836 DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: SqrtOp, Flags);
17837 SDValue F32Rsq =
17838 DAG.getNode(Opcode: AMDGPUISD::RSQ, DL: SL, VT: MVT::f32, Operand: Ext, Flags);
17839 Rsq = DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT, N1: F32Rsq,
17840 N2: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32), Flags);
17841 } else {
17842 // bf16 shares f32's exponent range, so bf16 denormals would extend to
17843 // f32 denormals that v_rsq_f32 does not handle. Leave it expanded.
17844 return SDValue();
17845 }
17846 return IsNegative ? DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Rsq, Flags) : Rsq;
17847 }
17848 }
17849 }
17850
17851 return SDValue();
17852}
17853
17854SDValue SITargetLowering::performFMulCombine(SDNode *N,
17855 DAGCombinerInfo &DCI) const {
17856 SelectionDAG &DAG = DCI.DAG;
17857 EVT VT = N->getValueType(ResNo: 0);
17858 EVT ScalarVT = VT.getScalarType();
17859 EVT IntVT = VT.changeElementType(Context&: *DAG.getContext(), EltVT: MVT::i32);
17860
17861 if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
17862 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
17863 // Prefer to use s_mul_f16/f32 instead of v_ldexp_f16/f32.
17864 return SDValue();
17865 }
17866
17867 SDValue LHS = N->getOperand(Num: 0);
17868 SDValue RHS = N->getOperand(Num: 1);
17869
17870 // It is cheaper to realize i32 inline constants as compared against
17871 // materializing f16 or f64 (or even non-inline f32) values,
17872 // possible via ldexp usage, as shown below :
17873 //
17874 // Given : A = 2^a & B = 2^b ; where a and b are integers.
17875 // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
17876 // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
17877 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
17878 (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
17879 const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(N: RHS.getOperand(i: 1));
17880 if (!TrueNode)
17881 return SDValue();
17882 const ConstantFPSDNode *FalseNode =
17883 isConstOrConstSplatFP(N: RHS.getOperand(i: 2));
17884 if (!FalseNode)
17885 return SDValue();
17886
17887 if (TrueNode->isNegative() != FalseNode->isNegative())
17888 return SDValue();
17889
17890 // For f32, only non-inline constants should be transformed.
17891 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17892 if (ScalarVT == MVT::f32 &&
17893 TII->isInlineConstant(Imm: TrueNode->getValueAPF()) &&
17894 TII->isInlineConstant(Imm: FalseNode->getValueAPF()))
17895 return SDValue();
17896
17897 int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
17898 if (TrueNodeExpVal == INT_MIN)
17899 return SDValue();
17900 int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
17901 if (FalseNodeExpVal == INT_MIN)
17902 return SDValue();
17903
17904 SDLoc SL(N);
17905 SDValue SelectNode =
17906 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: IntVT, N1: RHS.getOperand(i: 0),
17907 N2: DAG.getSignedConstant(Val: TrueNodeExpVal, DL: SL, VT: IntVT),
17908 N3: DAG.getSignedConstant(Val: FalseNodeExpVal, DL: SL, VT: IntVT));
17909
17910 LHS = TrueNode->isNegative()
17911 ? DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: LHS, Flags: LHS->getFlags())
17912 : LHS;
17913
17914 return DAG.getNode(Opcode: ISD::FLDEXP, DL: SL, VT, N1: LHS, N2: SelectNode, Flags: N->getFlags());
17915 }
17916
17917 return SDValue();
17918}
17919
17920SDValue SITargetLowering::performFMACombine(SDNode *N,
17921 DAGCombinerInfo &DCI) const {
17922 SelectionDAG &DAG = DCI.DAG;
17923 EVT VT = N->getValueType(ResNo: 0);
17924 SDLoc SL(N);
17925
17926 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
17927 return SDValue();
17928
17929 // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
17930 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
17931 SDValue Op1 = N->getOperand(Num: 0);
17932 SDValue Op2 = N->getOperand(Num: 1);
17933 SDValue FMA = N->getOperand(Num: 2);
17934
17935 if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
17936 Op2.getOpcode() != ISD::FP_EXTEND)
17937 return SDValue();
17938
17939 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
17940 // regardless of the denorm mode setting. Therefore,
17941 // fp-contract is sufficient to allow generating fdot2.
17942 const TargetOptions &Options = DAG.getTarget().Options;
17943 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
17944 (N->getFlags().hasAllowContract() &&
17945 FMA->getFlags().hasAllowContract())) {
17946 Op1 = Op1.getOperand(i: 0);
17947 Op2 = Op2.getOperand(i: 0);
17948 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
17949 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
17950 return SDValue();
17951
17952 SDValue Vec1 = Op1.getOperand(i: 0);
17953 SDValue Idx1 = Op1.getOperand(i: 1);
17954 SDValue Vec2 = Op2.getOperand(i: 0);
17955
17956 SDValue FMAOp1 = FMA.getOperand(i: 0);
17957 SDValue FMAOp2 = FMA.getOperand(i: 1);
17958 SDValue FMAAcc = FMA.getOperand(i: 2);
17959
17960 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
17961 FMAOp2.getOpcode() != ISD::FP_EXTEND)
17962 return SDValue();
17963
17964 FMAOp1 = FMAOp1.getOperand(i: 0);
17965 FMAOp2 = FMAOp2.getOperand(i: 0);
17966 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
17967 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
17968 return SDValue();
17969
17970 SDValue Vec3 = FMAOp1.getOperand(i: 0);
17971 SDValue Vec4 = FMAOp2.getOperand(i: 0);
17972 SDValue Idx2 = FMAOp1.getOperand(i: 1);
17973
17974 if (Idx1 != Op2.getOperand(i: 1) || Idx2 != FMAOp2.getOperand(i: 1) ||
17975 // Idx1 and Idx2 cannot be the same.
17976 Idx1 == Idx2)
17977 return SDValue();
17978
17979 if (Vec1 == Vec2 || Vec3 == Vec4)
17980 return SDValue();
17981
17982 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
17983 return SDValue();
17984
17985 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
17986 return DAG.getNode(Opcode: AMDGPUISD::FDOT2, DL: SL, VT: MVT::f32, N1: Vec1, N2: Vec2, N3: FMAAcc,
17987 N4: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i1));
17988 }
17989 }
17990 return SDValue();
17991}
17992
17993// Given a double-precision ordered or unordered comparison, return the
17994// condition code for an equivalent integral comparison of the operands' upper
17995// 32 bits, or `SETCC_INVALID` if not possible.
17996// For simplicity, no simplification occurs if the operands are not both known
17997// to have sign bit zero.
17998//
17999// EQ/NE:
18000// If LHS.lo32 == RHS.lo32:
18001// setcc LHS, RHS, eq/ne => setcc LHS.hi32, RHS.hi32, eq/ne
18002// If LHS.lo32 != RHS.lo32:
18003// setcc LHS, RHS, eq/ne => setcc LHS.hi32, RHS.hi32, false/true
18004// The reduction is not possible if operands may be +0 and -0.
18005// For ordered eq / unordered ne, at most one operand may be NaN.
18006// For unordered eq / ordered ne, neither operand can be NaN.
18007//
18008// LT/GE:
18009// If LHS.lo32 >= RHS.lo32 (unsigned):
18010// setcc LHS, RHS, [u]lt/ge => LHS.hi32, RHS.hi32, [u]lt/ge
18011// If LHS.lo32 < RHS.lo32 (unsigned):
18012// setcc LHS, RHS, [u]lt/ge => LHS.hi32, RHS.hi32, [u]le/gt
18013// The reduction is only supported if both operands are nonnegative.
18014// For ordered lt / unordered ge, the RHS cannot be NaN.
18015// For unordered lt / ordered ge, neither operand can be NaN.
18016//
18017// LE/GT:
18018// If LHS.lo32 > RHS.lo32 (unsigned):
18019// setcc LHS, RHS, [u]le/gt => LHS.hi32, RHS.hi32, [u]lt/ge
18020// If LHS.lo32 <= RHS.lo32 (unsigned):
18021// setcc LHS, RHS, [u]le/gt => LHS.hi32, RHS.hi32, [u]le/gt
18022// The reduction is only supported if both operands are nonnegative.
18023// For unordered le / ordered gt, the LHS cannot be NaN.
18024// For ordered le / unordered gt, neither operand can be NaN.
18025static ISD::CondCode tryReduceF64CompareToHiHalf(const ISD::CondCode CC,
18026 const SDValue LHS,
18027 const SDValue RHS,
18028 const SelectionDAG &DAG) {
18029 EVT VT = LHS.getValueType();
18030 assert(VT == MVT::f64 && "Incorrect operand type!");
18031
18032 const KnownBits RHSBits = DAG.computeKnownBits(Op: RHS);
18033 // Bail if RHS sign bit is not known to be zero.
18034 if (!RHSBits.Zero.isSignBitSet())
18035 return ISD::SETCC_INVALID;
18036
18037 const KnownBits RHSKnownLo32 = RHSBits.trunc(BitWidth: 32);
18038 const KnownFPClass RHSFPClass =
18039 KnownFPClass::bitcast(FltSemantics: VT.getFltSemantics(), Bits: RHSBits);
18040 const bool RHSMaybeNaN = !RHSFPClass.isKnownNeverNaN();
18041
18042 const KnownBits LHSBits = DAG.computeKnownBits(Op: LHS);
18043 const KnownBits LHSKnownLo32 = LHSBits.trunc(BitWidth: 32);
18044 const KnownFPClass LHSFPClass =
18045 KnownFPClass::bitcast(FltSemantics: VT.getFltSemantics(), Bits: LHSBits);
18046 const bool LHSMaybeNaN = !LHSFPClass.isKnownNeverNaN();
18047
18048 // Bail if LHS sign bit is not known to be zero.
18049 if (!LHSBits.Zero.isSignBitSet())
18050 return ISD::SETCC_INVALID;
18051
18052 switch (CC) {
18053 default:
18054 break;
18055 case ISD::SETEQ:
18056 case ISD::SETOEQ:
18057 case ISD::SETUEQ:
18058 case ISD::SETONE:
18059 case ISD::SETUNE: {
18060 // OEQ should be false if either operand is NaN, so it suffices that at
18061 // least one operand is not NaN.
18062 if (CC == ISD::SETOEQ && LHSMaybeNaN && RHSMaybeNaN)
18063 break;
18064 // UEQ should be true if either operand is NaN, but this cannot be checked
18065 // on underlying bits.
18066 if (CC == ISD::SETUEQ && (LHSMaybeNaN || RHSMaybeNaN))
18067 break;
18068 // ONE should be false if either operand is NaN, but this cannot be
18069 // checked on underlying bits.
18070 if (CC == ISD::SETONE && (LHSMaybeNaN || RHSMaybeNaN))
18071 break;
18072 // UNE should be true if either operand is NaN, so it suffices that they
18073 // are not both NaN.
18074 if (CC == ISD::SETUNE && LHSMaybeNaN && RHSMaybeNaN)
18075 break;
18076
18077 const std::optional<bool> KnownEq =
18078 KnownBits::eq(LHS: LHSKnownLo32, RHS: RHSKnownLo32);
18079
18080 if (!KnownEq)
18081 break;
18082
18083 if (*KnownEq)
18084 return (CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ)
18085 ? ISD::SETEQ
18086 : ISD::SETNE;
18087
18088 return (CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ)
18089 ? ISD::SETFALSE
18090 : ISD::SETTRUE;
18091 }
18092 case ISD::SETLT:
18093 case ISD::SETOLT:
18094 case ISD::SETULT:
18095 case ISD::SETGE:
18096 case ISD::SETOGE:
18097 case ISD::SETUGE: {
18098 // OLT should be false if either operand is NaN.
18099 // Since NaNs have maximum exponent and nonzero mantissa, false positives
18100 // are only possible if the RHS is NaN. (No issue with RHS == +inf since
18101 // the inequality is strict)
18102 if (CC == ISD::SETOLT && RHSMaybeNaN)
18103 break;
18104 // ULT should be true if either operand is NaN, but this cannot be ensured
18105 // with a truncated comparison.
18106 if (CC == ISD::SETULT && (LHSMaybeNaN || RHSMaybeNaN))
18107 break;
18108 // OGE should be false if either operand is NaN, but this cannot be
18109 // ensured with a truncated comparison.
18110 if (CC == ISD::SETOGE && (LHSMaybeNaN || RHSMaybeNaN))
18111 break;
18112 // UGE should be true if either operand is NaN.
18113 // False negatives are only possible if the RHS is NaN.
18114 // (No issue with RHS == +inf since the inequality is inclusive)
18115 if (CC == ISD::SETUGE && RHSMaybeNaN)
18116 break;
18117
18118 const std::optional<bool> KnownUge =
18119 KnownBits::uge(LHS: LHSKnownLo32, RHS: RHSKnownLo32);
18120
18121 if (!KnownUge)
18122 break;
18123
18124 if (*KnownUge) {
18125 // LHS.lo32 uge RHS.lo32, so LHS >= RHS iff LHS.hi32 >= RHS.hi32
18126 return (CC == ISD::SETLT || CC == ISD::SETOLT || CC == ISD::SETULT)
18127 ? ISD::SETLT
18128 : ISD::SETGE;
18129 }
18130 // LHS.lo32 ult RHS.lo32, so LHS >= RHS iff LHS.hi32 > RHS.hi32
18131 return (CC == ISD::SETLT || CC == ISD::SETOLT || CC == ISD::SETULT)
18132 ? ISD::SETLE
18133 : ISD::SETGT;
18134 }
18135 case ISD::SETLE:
18136 case ISD::SETOLE:
18137 case ISD::SETULE:
18138 case ISD::SETGT:
18139 case ISD::SETOGT:
18140 case ISD::SETUGT: {
18141 // OLE should be false if either operand is NaN, but this cannot be
18142 // ensured with a truncated comparison.
18143 if (CC == ISD::SETOLE && (LHSMaybeNaN || RHSMaybeNaN))
18144 break;
18145 // ULE should be true if either operand is NaN.
18146 // False negatives are only possible if the LHS is NaN.
18147 // (No issue with LHS == +inf since the inequality is inclusive)
18148 if (CC == ISD::SETULE && LHSMaybeNaN)
18149 break;
18150 // OGT should be false if either operand is NaN.
18151 // False positives are only possible if the LHS is NaN.
18152 // (No issue with LHS == +inf since the inequality is strict)
18153 if (CC == ISD::SETOGT && LHSMaybeNaN)
18154 break;
18155 // UGT should be true if either operand is NaN, but this cannot be ensured
18156 // with a truncated comparison.
18157 if (CC == ISD::SETUGT && (LHSMaybeNaN || RHSMaybeNaN))
18158 break;
18159
18160 const std::optional<bool> KnownUle =
18161 KnownBits::ule(LHS: LHSKnownLo32, RHS: RHSKnownLo32);
18162
18163 if (!KnownUle)
18164 break;
18165
18166 if (*KnownUle) {
18167 // LHS.lo32 ule RHS.lo32, so LHS <= RHS iff LHS.hi32 <= RHS.hi32
18168 return (CC == ISD::SETLE || CC == ISD::SETOLE || CC == ISD::SETULE)
18169 ? ISD::SETLE
18170 : ISD::SETGT;
18171 }
18172 // LHS.lo32 ugt RHS.lo32, so LHS <= RHS iff LHS.hi32 < RHS.hi32
18173 return (CC == ISD::SETLE || CC == ISD::SETOLE || CC == ISD::SETULE)
18174 ? ISD::SETLT
18175 : ISD::SETGE;
18176 }
18177 }
18178
18179 return ISD::SETCC_INVALID;
18180}
18181
18182SDValue SITargetLowering::performSetCCCombine(SDNode *N,
18183 DAGCombinerInfo &DCI) const {
18184 SelectionDAG &DAG = DCI.DAG;
18185 SDLoc SL(N);
18186
18187 SDValue LHS = N->getOperand(Num: 0);
18188 SDValue RHS = N->getOperand(Num: 1);
18189 EVT VT = LHS.getValueType();
18190 ISD::CondCode CC = cast<CondCodeSDNode>(Val: N->getOperand(Num: 2))->get();
18191
18192 auto *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
18193 if (!CRHS) {
18194 CRHS = dyn_cast<ConstantSDNode>(Val&: LHS);
18195 if (CRHS) {
18196 std::swap(a&: LHS, b&: RHS);
18197 CC = getSetCCSwappedOperands(Operation: CC);
18198 }
18199 }
18200
18201 if (CRHS) {
18202 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
18203 isBoolSGPR(V: LHS.getOperand(i: 0))) {
18204 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
18205 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
18206 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
18207 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
18208 if ((CRHS->isAllOnes() &&
18209 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
18210 (CRHS->isZero() &&
18211 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
18212 return DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i1, N1: LHS.getOperand(i: 0),
18213 N2: DAG.getAllOnesConstant(DL: SL, VT: MVT::i1));
18214 if ((CRHS->isAllOnes() &&
18215 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
18216 (CRHS->isZero() &&
18217 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
18218 return LHS.getOperand(i: 0);
18219 }
18220
18221 const APInt &CRHSVal = CRHS->getAPIntValue();
18222 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
18223 LHS.getOpcode() == ISD::SELECT &&
18224 isa<ConstantSDNode>(Val: LHS.getOperand(i: 1)) &&
18225 isa<ConstantSDNode>(Val: LHS.getOperand(i: 2)) &&
18226 isBoolSGPR(V: LHS.getOperand(i: 0))) {
18227 // Given CT != FT:
18228 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
18229 // setcc (select cc, CT, CF), CF, ne => cc
18230 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
18231 // setcc (select cc, CT, CF), CT, eq => cc
18232 const APInt &CT = LHS.getConstantOperandAPInt(i: 1);
18233 const APInt &CF = LHS.getConstantOperandAPInt(i: 2);
18234
18235 if (CT != CF) {
18236 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
18237 (CT == CRHSVal && CC == ISD::SETNE))
18238 return DAG.getNOT(DL: SL, Val: LHS.getOperand(i: 0), VT: MVT::i1);
18239 if ((CF == CRHSVal && CC == ISD::SETNE) ||
18240 (CT == CRHSVal && CC == ISD::SETEQ))
18241 return LHS.getOperand(i: 0);
18242 }
18243 }
18244 }
18245
18246 // Truncate 64-bit setcc to test only upper 32-bits of its operands in the
18247 // following cases where information about the lower 32-bits of its operands
18248 // is known:
18249 //
18250 // If LHS.lo32 == RHS.lo32:
18251 // setcc LHS, RHS, eq/ne => setcc LHS.hi32, RHS.hi32, eq/ne
18252 // If LHS.lo32 != RHS.lo32:
18253 // setcc LHS, RHS, eq/ne => setcc LHS.hi32, RHS.hi32, false/true
18254 // If LHS.lo32 >= RHS.lo32 (unsigned):
18255 // setcc LHS, RHS, [u]lt/ge => LHS.hi32, RHS.hi32, [u]lt/ge
18256 // If LHS.lo32 > RHS.lo32 (unsigned):
18257 // setcc LHS, RHS, [u]le/gt => LHS.hi32, RHS.hi32, [u]lt/ge
18258 // If LHS.lo32 <= RHS.lo32 (unsigned):
18259 // setcc LHS, RHS, [u]le/gt => LHS.hi32, RHS.hi32, [u]le/gt
18260 // If LHS.lo32 < RHS.lo32 (unsigned):
18261 // setcc LHS, RHS, [u]lt/ge => LHS.hi32, RHS.hi32, [u]le/gt
18262 if (VT == MVT::i64) {
18263 const KnownBits LHSKnownLo32 = DAG.computeKnownBits(Op: LHS).trunc(BitWidth: 32);
18264 const KnownBits RHSKnownLo32 = DAG.computeKnownBits(Op: RHS).trunc(BitWidth: 32);
18265
18266 // NewCC is valid iff we can truncate the setcc to only test the upper 32
18267 // bits
18268 ISD::CondCode NewCC = ISD::SETCC_INVALID;
18269
18270 switch (CC) {
18271 default:
18272 break;
18273 case ISD::SETEQ: {
18274 const std::optional<bool> KnownEq =
18275 KnownBits::eq(LHS: LHSKnownLo32, RHS: RHSKnownLo32);
18276 if (KnownEq)
18277 NewCC = *KnownEq ? ISD::SETEQ : ISD::SETFALSE;
18278
18279 break;
18280 }
18281 case ISD::SETNE: {
18282 const std::optional<bool> KnownEq =
18283 KnownBits::eq(LHS: LHSKnownLo32, RHS: RHSKnownLo32);
18284 if (KnownEq)
18285 NewCC = *KnownEq ? ISD::SETNE : ISD::SETTRUE;
18286
18287 break;
18288 }
18289 case ISD::SETULT:
18290 case ISD::SETUGE:
18291 case ISD::SETLT:
18292 case ISD::SETGE: {
18293 const std::optional<bool> KnownUge =
18294 KnownBits::uge(LHS: LHSKnownLo32, RHS: RHSKnownLo32);
18295 if (KnownUge) {
18296 if (*KnownUge) {
18297 // LHS.lo32 uge RHS.lo32, so LHS >= RHS iff LHS.hi32 >= RHS.hi32
18298 NewCC = CC;
18299 } else {
18300 // LHS.lo32 ult RHS.lo32, so LHS >= RHS iff LHS.hi32 > RHS.hi32
18301 NewCC = CC == ISD::SETULT ? ISD::SETULE
18302 : CC == ISD::SETUGE ? ISD::SETUGT
18303 : CC == ISD::SETLT ? ISD::SETLE
18304 : ISD::SETGT;
18305 }
18306 }
18307 break;
18308 }
18309 case ISD::SETULE:
18310 case ISD::SETUGT:
18311 case ISD::SETLE:
18312 case ISD::SETGT: {
18313 const std::optional<bool> KnownUle =
18314 KnownBits::ule(LHS: LHSKnownLo32, RHS: RHSKnownLo32);
18315 if (KnownUle) {
18316 if (*KnownUle) {
18317 // LHS.lo32 ule RHS.lo32, so LHS <= RHS iff LHS.hi32 <= RHS.hi32
18318 NewCC = CC;
18319 } else {
18320 // LHS.lo32 ugt RHS.lo32, so LHS <= RHS iff LHS.hi32 < RHS.hi32
18321 NewCC = CC == ISD::SETULE ? ISD::SETULT
18322 : CC == ISD::SETUGT ? ISD::SETUGE
18323 : CC == ISD::SETLE ? ISD::SETLT
18324 : ISD::SETGE;
18325 }
18326 }
18327 break;
18328 }
18329 }
18330
18331 if (NewCC != ISD::SETCC_INVALID)
18332 return DAG.getSetCC(DL: SL, VT: N->getValueType(ResNo: 0), LHS: getHiHalf64(Op: LHS, DAG),
18333 RHS: getHiHalf64(Op: RHS, DAG), Cond: NewCC);
18334 }
18335
18336 // Eliminate setcc by using carryout from add/sub instruction
18337
18338 // LHS = ADD i64 RHS, Z LHSlo = UADDO i32 RHSlo, Zlo
18339 // setcc LHS ult RHS -> LHSHi = UADDO_CARRY i32 RHShi, Zhi
18340 // similarly for subtraction
18341
18342 // LHS = ADD i64 Y, 1 LHSlo = UADDO i32 Ylo, 1
18343 // setcc LHS eq 0 -> LHSHi = UADDO_CARRY i32 Yhi, 0
18344
18345 if (VT == MVT::i64 && ((CC == ISD::SETULT &&
18346 sd_match(N: LHS, P: m_Add(L: m_Specific(N: RHS), R: m_Value()))) ||
18347 (CC == ISD::SETUGT &&
18348 sd_match(N: LHS, P: m_Sub(L: m_Specific(N: RHS), R: m_Value()))) ||
18349 (CC == ISD::SETEQ && CRHS && CRHS->isZero() &&
18350 sd_match(N: LHS, P: m_Add(L: m_Value(), R: m_One()))))) {
18351 bool IsAdd = LHS.getOpcode() == ISD::ADD;
18352
18353 SDValue Op0 = LHS.getOperand(i: 0);
18354 SDValue Op1 = LHS.getOperand(i: 1);
18355
18356 SDValue Op0Lo = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: Op0);
18357 SDValue Op1Lo = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: Op1);
18358
18359 SDValue Op0Hi = getHiHalf64(Op: Op0, DAG);
18360 SDValue Op1Hi = getHiHalf64(Op: Op1, DAG);
18361
18362 SDValue NodeLo =
18363 DAG.getNode(Opcode: IsAdd ? ISD::UADDO : ISD::USUBO, DL: SL,
18364 VTList: DAG.getVTList(VT1: MVT::i32, VT2: MVT::i1), Ops: {Op0Lo, Op1Lo});
18365
18366 SDValue CarryInHi = NodeLo.getValue(R: 1);
18367 SDValue NodeHi = DAG.getNode(Opcode: IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY,
18368 DL: SL, VTList: DAG.getVTList(VT1: MVT::i32, VT2: MVT::i1),
18369 Ops: {Op0Hi, Op1Hi, CarryInHi});
18370
18371 SDValue ResultLo = NodeLo.getValue(R: 0);
18372 SDValue ResultHi = NodeHi.getValue(R: 0);
18373
18374 SDValue JoinedResult =
18375 DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {ResultLo, ResultHi});
18376
18377 SDValue Result = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: JoinedResult);
18378 SDValue Overflow = NodeHi.getValue(R: 1);
18379 DCI.CombineTo(N: LHS.getNode(), Res: Result);
18380 return Overflow;
18381 }
18382
18383 if (VT != MVT::f32 && VT != MVT::f64 &&
18384 (!Subtarget->has16BitInsts() || VT != MVT::f16))
18385 return SDValue();
18386
18387 // Match isinf/isfinite pattern
18388 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
18389 // (fcmp one (fabs x), inf) -> (fp_class x,
18390 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
18391 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
18392 LHS.getOpcode() == ISD::FABS) {
18393 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(Val&: RHS);
18394 if (!CRHS)
18395 return SDValue();
18396
18397 const APFloat &APF = CRHS->getValueAPF();
18398 if (APF.isInfinity() && !APF.isNegative()) {
18399 const unsigned IsInfMask =
18400 SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
18401 const unsigned IsFiniteMask =
18402 SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO | SIInstrFlags::N_NORMAL |
18403 SIInstrFlags::P_NORMAL | SIInstrFlags::N_SUBNORMAL |
18404 SIInstrFlags::P_SUBNORMAL;
18405 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
18406 return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL: SL, VT: MVT::i1, N1: LHS.getOperand(i: 0),
18407 N2: DAG.getConstant(Val: Mask, DL: SL, VT: MVT::i32));
18408 }
18409 }
18410
18411 if (VT == MVT::f64) {
18412 ISD::CondCode HiHalfCC = tryReduceF64CompareToHiHalf(CC, LHS, RHS, DAG);
18413 if (HiHalfCC != ISD::SETCC_INVALID)
18414 return DAG.getSetCC(DL: SL, VT: N->getValueType(ResNo: 0), LHS: getHiHalf64(Op: LHS, DAG),
18415 RHS: getHiHalf64(Op: RHS, DAG), Cond: HiHalfCC);
18416 }
18417
18418 return SDValue();
18419}
18420
18421SDValue
18422SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
18423 DAGCombinerInfo &DCI) const {
18424 SelectionDAG &DAG = DCI.DAG;
18425 SDLoc SL(N);
18426 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
18427
18428 SDValue Src = N->getOperand(Num: 0);
18429 SDValue Shift = N->getOperand(Num: 0);
18430
18431 // TODO: Extend type shouldn't matter (assuming legal types).
18432 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
18433 Shift = Shift.getOperand(i: 0);
18434
18435 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
18436 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
18437 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
18438 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
18439 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
18440 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
18441 if (auto *C = dyn_cast<ConstantSDNode>(Val: Shift.getOperand(i: 1))) {
18442 SDValue Shifted = DAG.getZExtOrTrunc(
18443 Op: Shift.getOperand(i: 0), DL: SDLoc(Shift.getOperand(i: 0)), VT: MVT::i32);
18444
18445 unsigned ShiftOffset = 8 * Offset;
18446 if (Shift.getOpcode() == ISD::SHL)
18447 ShiftOffset -= C->getZExtValue();
18448 else
18449 ShiftOffset += C->getZExtValue();
18450
18451 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
18452 return DAG.getNode(Opcode: AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, DL: SL,
18453 VT: MVT::f32, Operand: Shifted);
18454 }
18455 }
18456 }
18457
18458 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18459 APInt DemandedBits = APInt::getBitsSet(numBits: 32, loBit: 8 * Offset, hiBit: 8 * Offset + 8);
18460 if (TLI.SimplifyDemandedBits(Op: Src, DemandedBits, DCI)) {
18461 // We simplified Src. If this node is not dead, visit it again so it is
18462 // folded properly.
18463 if (N->getOpcode() != ISD::DELETED_NODE)
18464 DCI.AddToWorklist(N);
18465 return SDValue(N, 0);
18466 }
18467
18468 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
18469 if (SDValue DemandedSrc =
18470 TLI.SimplifyMultipleUseDemandedBits(Op: Src, DemandedBits, DAG))
18471 return DAG.getNode(Opcode: N->getOpcode(), DL: SL, VT: MVT::f32, Operand: DemandedSrc);
18472
18473 return SDValue();
18474}
18475
18476SDValue SITargetLowering::performClampCombine(SDNode *N,
18477 DAGCombinerInfo &DCI) const {
18478 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(Val: N->getOperand(Num: 0));
18479 if (!CSrc)
18480 return SDValue();
18481
18482 const MachineFunction &MF = DCI.DAG.getMachineFunction();
18483 const APFloat &F = CSrc->getValueAPF();
18484 APFloat Zero = APFloat::getZero(Sem: F.getSemantics());
18485 if (F < Zero ||
18486 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
18487 return DCI.DAG.getConstantFP(Val: Zero, DL: SDLoc(N), VT: N->getValueType(ResNo: 0));
18488 }
18489
18490 APFloat One(F.getSemantics(), "1.0");
18491 if (F > One)
18492 return DCI.DAG.getConstantFP(Val: One, DL: SDLoc(N), VT: N->getValueType(ResNo: 0));
18493
18494 return SDValue(CSrc, 0);
18495}
18496
18497SDValue SITargetLowering::performSelectCombine(SDNode *N,
18498 DAGCombinerInfo &DCI) const {
18499
18500 // Try to fold CMP + SELECT patterns with shared constants (both FP and
18501 // integer).
18502 // Detect when CMP and SELECT use the same constant and fold them to avoid
18503 // loading the constant twice. Specifically handles patterns like:
18504 // %cmp = icmp eq i32 %val, 4242
18505 // %sel = select i1 %cmp, i32 4242, i32 %other
18506 // It can be optimized to reuse %val instead of 4242 in select.
18507 SDValue Cond = N->getOperand(Num: 0);
18508 SDValue TrueVal = N->getOperand(Num: 1);
18509 SDValue FalseVal = N->getOperand(Num: 2);
18510
18511 // Check if condition is a comparison.
18512 if (Cond.getOpcode() != ISD::SETCC)
18513 return SDValue();
18514
18515 SDValue LHS = Cond.getOperand(i: 0);
18516 SDValue RHS = Cond.getOperand(i: 1);
18517 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Cond.getOperand(i: 2))->get();
18518
18519 bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
18520 bool isInteger = LHS.getValueType().isInteger();
18521
18522 // Handle simple floating-point and integer types only.
18523 if (!isFloatingPoint && !isInteger)
18524 return SDValue();
18525
18526 bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ);
18527 bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE);
18528 if (!isEquality && !isNonEquality)
18529 return SDValue();
18530
18531 SDValue ArgVal, ConstVal;
18532 if ((isFloatingPoint && isa<ConstantFPSDNode>(Val: RHS)) ||
18533 (isInteger && isa<ConstantSDNode>(Val: RHS))) {
18534 ConstVal = RHS;
18535 ArgVal = LHS;
18536 } else if ((isFloatingPoint && isa<ConstantFPSDNode>(Val: LHS)) ||
18537 (isInteger && isa<ConstantSDNode>(Val: LHS))) {
18538 ConstVal = LHS;
18539 ArgVal = RHS;
18540 } else {
18541 return SDValue();
18542 }
18543
18544 // Skip optimization for inlinable immediates.
18545 if (isFloatingPoint) {
18546 const APFloat &Val = cast<ConstantFPSDNode>(Val&: ConstVal)->getValueAPF();
18547 if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Imm: Val))
18548 return SDValue();
18549 } else {
18550 const std::optional<int64_t> Val =
18551 cast<ConstantSDNode>(Val&: ConstVal)->getAPIntValue().trySExtValue();
18552 if (Val && AMDGPU::isInlinableIntLiteral(Literal: *Val))
18553 return SDValue();
18554 }
18555
18556 // For equality and non-equality comparisons, patterns:
18557 // select (setcc x, const), const, y -> select (setcc x, const), x, y
18558 // select (setccinv x, const), y, const -> select (setccinv x, const), y, x
18559 if (!(isEquality && TrueVal == ConstVal) &&
18560 !(isNonEquality && FalseVal == ConstVal))
18561 return SDValue();
18562
18563 SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
18564 SDValue SelectRHS =
18565 (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
18566 return DCI.DAG.getNode(Opcode: ISD::SELECT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: Cond,
18567 N2: SelectLHS, N3: SelectRHS);
18568}
18569
18570SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
18571 DAGCombinerInfo &DCI) const {
18572 switch (N->getOpcode()) {
18573 case ISD::ABS:
18574 if (SDValue Res = promoteUniformUnaryOpToI32(Op: SDValue(N, 0), DCI))
18575 return Res;
18576 break;
18577 case ISD::ADD:
18578 case ISD::SUB:
18579 case ISD::SHL:
18580 case ISD::SRL:
18581 case ISD::SRA:
18582 case ISD::AND:
18583 case ISD::OR:
18584 case ISD::XOR:
18585 case ISD::MUL:
18586 case ISD::SETCC:
18587 case ISD::SELECT:
18588 case ISD::SMIN:
18589 case ISD::SMAX:
18590 case ISD::UMIN:
18591 case ISD::UMAX:
18592 case ISD::USUBSAT:
18593 if (auto Res = promoteUniformOpToI32(Op: SDValue(N, 0), DCI))
18594 return Res;
18595 break;
18596 default:
18597 break;
18598 }
18599
18600 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
18601 return SDValue();
18602
18603 switch (N->getOpcode()) {
18604 case ISD::ADD:
18605 return performAddCombine(N, DCI);
18606 case ISD::PTRADD:
18607 return performPtrAddCombine(N, DCI);
18608 case ISD::SUB:
18609 return performSubCombine(N, DCI);
18610 case ISD::FADD:
18611 return performFAddCombine(N, DCI);
18612 case ISD::FSUB:
18613 return performFSubCombine(N, DCI);
18614 case ISD::FDIV:
18615 return performFDivCombine(N, DCI);
18616 case ISD::FMUL:
18617 return performFMulCombine(N, DCI);
18618 case ISD::SETCC:
18619 return performSetCCCombine(N, DCI);
18620 case ISD::SELECT:
18621 if (auto Res = performSelectCombine(N, DCI))
18622 return Res;
18623 break;
18624 case ISD::FMAXNUM:
18625 case ISD::FMINNUM:
18626 case ISD::FMAXNUM_IEEE:
18627 case ISD::FMINNUM_IEEE:
18628 case ISD::FMAXIMUM:
18629 case ISD::FMINIMUM:
18630 case ISD::FMAXIMUMNUM:
18631 case ISD::FMINIMUMNUM:
18632 case ISD::SMAX:
18633 case ISD::SMIN:
18634 case ISD::UMAX:
18635 case ISD::UMIN:
18636 case AMDGPUISD::FMIN_LEGACY:
18637 case AMDGPUISD::FMAX_LEGACY:
18638 return performMinMaxCombine(N, DCI);
18639 case ISD::FMA:
18640 return performFMACombine(N, DCI);
18641 case ISD::AND:
18642 return performAndCombine(N, DCI);
18643 case ISD::OR:
18644 return performOrCombine(N, DCI);
18645 case ISD::FSHR: {
18646 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
18647 if (N->getValueType(ResNo: 0) == MVT::i32 && N->isDivergent() &&
18648 TII->pseudoToMCOpcode(Opcode: AMDGPU::V_PERM_B32_e64) != -1) {
18649 return matchPERM(N, DCI);
18650 }
18651 break;
18652 }
18653 case ISD::XOR:
18654 return performXorCombine(N, DCI);
18655 case ISD::ANY_EXTEND:
18656 case ISD::ZERO_EXTEND:
18657 return performZeroOrAnyExtendCombine(N, DCI);
18658 case ISD::SIGN_EXTEND_INREG:
18659 return performSignExtendInRegCombine(N, DCI);
18660 case AMDGPUISD::FP_CLASS:
18661 return performClassCombine(N, DCI);
18662 case ISD::FCANONICALIZE:
18663 return performFCanonicalizeCombine(N, DCI);
18664 case AMDGPUISD::RCP:
18665 return performRcpCombine(N, DCI);
18666 case ISD::FLDEXP:
18667 case AMDGPUISD::FRACT:
18668 case AMDGPUISD::RSQ:
18669 case AMDGPUISD::RCP_LEGACY:
18670 case AMDGPUISD::RCP_IFLAG:
18671 case AMDGPUISD::RSQ_CLAMP: {
18672 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
18673 SDValue Src = N->getOperand(Num: 0);
18674 if (Src.isUndef())
18675 return Src;
18676 break;
18677 }
18678 case ISD::SINT_TO_FP:
18679 case ISD::UINT_TO_FP:
18680 return performUCharToFloatCombine(N, DCI);
18681 case ISD::FCOPYSIGN:
18682 return performFCopySignCombine(N, DCI);
18683 case AMDGPUISD::CVT_F32_UBYTE0:
18684 case AMDGPUISD::CVT_F32_UBYTE1:
18685 case AMDGPUISD::CVT_F32_UBYTE2:
18686 case AMDGPUISD::CVT_F32_UBYTE3:
18687 return performCvtF32UByteNCombine(N, DCI);
18688 case AMDGPUISD::FMED3:
18689 return performFMed3Combine(N, DCI);
18690 case AMDGPUISD::CVT_PKRTZ_F16_F32:
18691 return performCvtPkRTZCombine(N, DCI);
18692 case AMDGPUISD::CLAMP:
18693 return performClampCombine(N, DCI);
18694 case ISD::SCALAR_TO_VECTOR: {
18695 SelectionDAG &DAG = DCI.DAG;
18696 EVT VT = N->getValueType(ResNo: 0);
18697
18698 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
18699 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
18700 SDLoc SL(N);
18701 SDValue Src = N->getOperand(Num: 0);
18702 EVT EltVT = Src.getValueType();
18703 if (EltVT != MVT::i16)
18704 Src = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i16, Operand: Src);
18705
18706 SDValue Ext = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i32, Operand: Src);
18707 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Ext);
18708 }
18709
18710 break;
18711 }
18712 case ISD::EXTRACT_VECTOR_ELT:
18713 return performExtractVectorEltCombine(N, DCI);
18714 case ISD::INSERT_VECTOR_ELT:
18715 return performInsertVectorEltCombine(N, DCI);
18716 case ISD::FP_ROUND:
18717 return performFPRoundCombine(N, DCI);
18718 case ISD::LOAD: {
18719 if (SDValue Widened = widenLoad(Ld: cast<LoadSDNode>(Val: N), DCI))
18720 return Widened;
18721 [[fallthrough]];
18722 }
18723 default: {
18724 if (!DCI.isBeforeLegalize()) {
18725 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(Val: N))
18726 return performMemSDNodeCombine(N: MemNode, DCI);
18727 }
18728
18729 break;
18730 }
18731 }
18732
18733 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
18734}
18735
18736/// Helper function for adjustWritemask
18737static unsigned SubIdx2Lane(unsigned Idx) {
18738 switch (Idx) {
18739 default:
18740 return ~0u;
18741 case AMDGPU::sub0:
18742 return 0;
18743 case AMDGPU::sub1:
18744 return 1;
18745 case AMDGPU::sub2:
18746 return 2;
18747 case AMDGPU::sub3:
18748 return 3;
18749 case AMDGPU::sub4:
18750 return 4; // Possible with TFE/LWE
18751 }
18752}
18753
18754/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
18755SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
18756 SelectionDAG &DAG) const {
18757 unsigned Opcode = Node->getMachineOpcode();
18758
18759 // Subtract 1 because the vdata output is not a MachineSDNode operand.
18760 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::d16) - 1;
18761 if (D16Idx >= 0 && Node->getConstantOperandVal(Num: D16Idx))
18762 return Node; // not implemented for D16
18763
18764 SDNode *Users[5] = {nullptr};
18765 unsigned Lane = 0;
18766 unsigned DmaskIdx =
18767 AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::dmask) - 1;
18768 unsigned OldDmask = Node->getConstantOperandVal(Num: DmaskIdx);
18769 unsigned NewDmask = 0;
18770 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::tfe) - 1;
18771 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::lwe) - 1;
18772 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(Num: TFEIdx)) ||
18773 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(Num: LWEIdx));
18774 unsigned TFCLane = 0;
18775 bool HasChain = Node->getNumValues() > 1;
18776
18777 if (OldDmask == 0) {
18778 // These are folded out, but on the chance it happens don't assert.
18779 return Node;
18780 }
18781
18782 unsigned OldBitsSet = llvm::popcount(Value: OldDmask);
18783 // Work out which is the TFE/LWE lane if that is enabled.
18784 if (UsesTFC) {
18785 TFCLane = OldBitsSet;
18786 }
18787
18788 // Try to figure out the used register components
18789 for (SDUse &Use : Node->uses()) {
18790
18791 // Don't look at users of the chain.
18792 if (Use.getResNo() != 0)
18793 continue;
18794
18795 SDNode *User = Use.getUser();
18796
18797 // Abort if we can't understand the usage
18798 if (!User->isMachineOpcode() ||
18799 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
18800 return Node;
18801
18802 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
18803 // Note that subregs are packed, i.e. Lane==0 is the first bit set
18804 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
18805 // set, etc.
18806 Lane = SubIdx2Lane(Idx: User->getConstantOperandVal(Num: 1));
18807 if (Lane == ~0u)
18808 return Node;
18809
18810 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
18811 if (UsesTFC && Lane == TFCLane) {
18812 Users[Lane] = User;
18813 } else {
18814 // Set which texture component corresponds to the lane.
18815 unsigned Comp;
18816 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
18817 Comp = llvm::countr_zero(Val: Dmask);
18818 Dmask &= ~(1 << Comp);
18819 }
18820
18821 // Abort if we have more than one user per component.
18822 if (Users[Lane])
18823 return Node;
18824
18825 Users[Lane] = User;
18826 NewDmask |= 1 << Comp;
18827 }
18828 }
18829
18830 // Don't allow 0 dmask, as hardware assumes one channel enabled.
18831 bool NoChannels = !NewDmask;
18832 if (NoChannels) {
18833 if (!UsesTFC) {
18834 // No uses of the result and not using TFC. Then do nothing.
18835 return Node;
18836 }
18837 // If the original dmask has one channel - then nothing to do
18838 if (OldBitsSet == 1)
18839 return Node;
18840 // Use an arbitrary dmask - required for the instruction to work
18841 NewDmask = 1;
18842 }
18843 // Abort if there's no change
18844 if (NewDmask == OldDmask)
18845 return Node;
18846
18847 unsigned BitsSet = llvm::popcount(Value: NewDmask);
18848
18849 // Check for TFE or LWE - increase the number of channels by one to account
18850 // for the extra return value
18851 // This will need adjustment for D16 if this is also included in
18852 // adjustWriteMask (this function) but at present D16 are excluded.
18853 unsigned NewChannels = BitsSet + UsesTFC;
18854
18855 int NewOpcode =
18856 AMDGPU::getMaskedMIMGOp(Opc: Node->getMachineOpcode(), NewChannels);
18857 assert(NewOpcode != -1 &&
18858 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
18859 "failed to find equivalent MIMG op");
18860
18861 // Adjust the writemask in the node
18862 SmallVector<SDValue, 12> Ops;
18863 llvm::append_range(C&: Ops, R: Node->ops().take_front(N: DmaskIdx));
18864 Ops.push_back(Elt: DAG.getTargetConstant(Val: NewDmask, DL: SDLoc(Node), VT: MVT::i32));
18865 llvm::append_range(C&: Ops, R: Node->ops().drop_front(N: DmaskIdx + 1));
18866
18867 MVT SVT = Node->getValueType(ResNo: 0).getVectorElementType().getSimpleVT();
18868
18869 MVT ResultVT = NewChannels == 1
18870 ? SVT
18871 : MVT::getVectorVT(VT: SVT, NumElements: NewChannels == 3 ? 4
18872 : NewChannels == 5 ? 8
18873 : NewChannels);
18874 SDVTList NewVTList =
18875 HasChain ? DAG.getVTList(VT1: ResultVT, VT2: MVT::Other) : DAG.getVTList(VT: ResultVT);
18876
18877 MachineSDNode *NewNode =
18878 DAG.getMachineNode(Opcode: NewOpcode, dl: SDLoc(Node), VTs: NewVTList, Ops);
18879
18880 if (HasChain) {
18881 // Update chain.
18882 DAG.setNodeMemRefs(N: NewNode, NewMemRefs: Node->memoperands());
18883 DAG.ReplaceAllUsesOfValueWith(From: SDValue(Node, 1), To: SDValue(NewNode, 1));
18884 }
18885
18886 if (NewChannels == 1) {
18887 assert(Node->hasNUsesOfValue(1, 0));
18888 SDNode *Copy =
18889 DAG.getMachineNode(Opcode: TargetOpcode::COPY, dl: SDLoc(Node),
18890 VT: Users[Lane]->getValueType(ResNo: 0), Op1: SDValue(NewNode, 0));
18891 DAG.ReplaceAllUsesWith(From: Users[Lane], To: Copy);
18892 return nullptr;
18893 }
18894
18895 // Update the users of the node with the new indices
18896 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
18897 SDNode *User = Users[i];
18898 if (!User) {
18899 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
18900 // Users[0] is still nullptr because channel 0 doesn't really have a use.
18901 if (i || !NoChannels)
18902 continue;
18903 } else {
18904 SDValue Op = DAG.getTargetConstant(Val: Idx, DL: SDLoc(User), VT: MVT::i32);
18905 SDNode *NewUser = DAG.UpdateNodeOperands(N: User, Op1: SDValue(NewNode, 0), Op2: Op);
18906 if (NewUser != User) {
18907 DAG.ReplaceAllUsesWith(From: SDValue(User, 0), To: SDValue(NewUser, 0));
18908 DAG.RemoveDeadNode(N: User);
18909 }
18910 }
18911
18912 switch (Idx) {
18913 default:
18914 break;
18915 case AMDGPU::sub0:
18916 Idx = AMDGPU::sub1;
18917 break;
18918 case AMDGPU::sub1:
18919 Idx = AMDGPU::sub2;
18920 break;
18921 case AMDGPU::sub2:
18922 Idx = AMDGPU::sub3;
18923 break;
18924 case AMDGPU::sub3:
18925 Idx = AMDGPU::sub4;
18926 break;
18927 }
18928 }
18929
18930 DAG.RemoveDeadNode(N: Node);
18931 return nullptr;
18932}
18933
18934static bool isFrameIndexOp(SDValue Op) {
18935 if (Op.getOpcode() == ISD::AssertZext)
18936 Op = Op.getOperand(i: 0);
18937
18938 return isa<FrameIndexSDNode>(Val: Op);
18939}
18940
18941/// Legalize target independent instructions (e.g. INSERT_SUBREG)
18942/// with frame index operands.
18943/// LLVM assumes that inputs are to these instructions are registers.
18944SDNode *
18945SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
18946 SelectionDAG &DAG) const {
18947 if (Node->getOpcode() == ISD::CopyToReg) {
18948 RegisterSDNode *DestReg = cast<RegisterSDNode>(Val: Node->getOperand(Num: 1));
18949 SDValue SrcVal = Node->getOperand(Num: 2);
18950
18951 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
18952 // to try understanding copies to physical registers.
18953 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
18954 SDLoc SL(Node);
18955 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
18956 SDValue VReg = DAG.getRegister(
18957 Reg: MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_1RegClass), VT: MVT::i1);
18958
18959 SDNode *Glued = Node->getGluedNode();
18960 SDValue ToVReg = DAG.getCopyToReg(
18961 Chain: Node->getOperand(Num: 0), dl: SL, Reg: VReg, N: SrcVal,
18962 Glue: SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
18963 SDValue ToResultReg = DAG.getCopyToReg(Chain: ToVReg, dl: SL, Reg: SDValue(DestReg, 0),
18964 N: VReg, Glue: ToVReg.getValue(R: 1));
18965 DAG.ReplaceAllUsesWith(From: Node, To: ToResultReg.getNode());
18966 DAG.RemoveDeadNode(N: Node);
18967 return ToResultReg.getNode();
18968 }
18969 }
18970
18971 SmallVector<SDValue, 8> Ops;
18972 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
18973 if (!isFrameIndexOp(Op: Node->getOperand(Num: i))) {
18974 Ops.push_back(Elt: Node->getOperand(Num: i));
18975 continue;
18976 }
18977
18978 SDLoc DL(Node);
18979 Ops.push_back(Elt: SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL,
18980 VT: Node->getOperand(Num: i).getValueType(),
18981 Op1: Node->getOperand(Num: i)),
18982 0));
18983 }
18984
18985 return DAG.UpdateNodeOperands(N: Node, Ops);
18986}
18987
18988/// Fold the instructions after selecting them.
18989/// Returns null if users were already updated.
18990SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
18991 SelectionDAG &DAG) const {
18992 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
18993 unsigned Opcode = Node->getMachineOpcode();
18994
18995 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
18996 !TII->isGather4(Opcode) &&
18997 AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::dmask)) {
18998 return adjustWritemask(Node, DAG);
18999 }
19000
19001 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
19002 legalizeTargetIndependentNode(Node, DAG);
19003 return Node;
19004 }
19005
19006 switch (Opcode) {
19007 case AMDGPU::V_DIV_SCALE_F32_e64:
19008 case AMDGPU::V_DIV_SCALE_F64_e64: {
19009 // Satisfy the operand register constraint when one of the inputs is
19010 // undefined. Ordinarily each undef value will have its own implicit_def of
19011 // a vreg, so force these to use a single register.
19012 SDValue Src0 = Node->getOperand(Num: 1);
19013 SDValue Src1 = Node->getOperand(Num: 3);
19014 SDValue Src2 = Node->getOperand(Num: 5);
19015
19016 if ((Src0.isMachineOpcode() &&
19017 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
19018 (Src0 == Src1 || Src0 == Src2))
19019 break;
19020
19021 MVT VT = Src0.getValueType().getSimpleVT();
19022 const TargetRegisterClass *RC =
19023 getRegClassFor(VT, isDivergent: Src0.getNode()->isDivergent());
19024
19025 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
19026 SDValue UndefReg = DAG.getRegister(Reg: MRI.createVirtualRegister(RegClass: RC), VT);
19027
19028 SDValue ImpDef = DAG.getCopyToReg(Chain: DAG.getEntryNode(), dl: SDLoc(Node), Reg: UndefReg,
19029 N: Src0, Glue: SDValue());
19030
19031 // src0 must be the same register as src1 or src2, even if the value is
19032 // undefined, so make sure we don't violate this constraint.
19033 if (Src0.isMachineOpcode() &&
19034 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
19035 if (Src1.isMachineOpcode() &&
19036 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
19037 Src0 = Src1;
19038 else if (Src2.isMachineOpcode() &&
19039 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
19040 Src0 = Src2;
19041 else {
19042 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
19043 Src0 = UndefReg;
19044 Src1 = UndefReg;
19045 }
19046 } else
19047 break;
19048
19049 SmallVector<SDValue, 9> Ops(Node->ops());
19050 Ops[1] = Src0;
19051 Ops[3] = Src1;
19052 Ops[5] = Src2;
19053 Ops.push_back(Elt: ImpDef.getValue(R: 1));
19054 return DAG.getMachineNode(Opcode, dl: SDLoc(Node), VTs: Node->getVTList(), Ops);
19055 }
19056 default:
19057 break;
19058 }
19059
19060 return Node;
19061}
19062
19063// Any MIMG instructions that use tfe or lwe require an initialization of the
19064// result register that will be written in the case of a memory access failure.
19065// The required code is also added to tie this init code to the result of the
19066// img instruction.
19067void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
19068 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
19069 const SIRegisterInfo &TRI = TII->getRegisterInfo();
19070 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
19071 MachineBasicBlock &MBB = *MI.getParent();
19072
19073 int DstIdx =
19074 AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::vdata);
19075 unsigned InitIdx = 0;
19076
19077 if (TII->isImage(MI)) {
19078 MachineOperand *TFE = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::tfe);
19079 MachineOperand *LWE = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::lwe);
19080 MachineOperand *D16 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::d16);
19081
19082 if (!TFE && !LWE) // intersect_ray
19083 return;
19084
19085 unsigned TFEVal = TFE ? TFE->getImm() : 0;
19086 unsigned LWEVal = LWE ? LWE->getImm() : 0;
19087 unsigned D16Val = D16 ? D16->getImm() : 0;
19088
19089 if (!TFEVal && !LWEVal)
19090 return;
19091
19092 // At least one of TFE or LWE are non-zero
19093 // We have to insert a suitable initialization of the result value and
19094 // tie this to the dest of the image instruction.
19095
19096 // Calculate which dword we have to initialize to 0.
19097 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::dmask);
19098
19099 // check that dmask operand is found.
19100 assert(MO_Dmask && "Expected dmask operand in instruction");
19101
19102 unsigned dmask = MO_Dmask->getImm();
19103 // Determine the number of active lanes taking into account the
19104 // Gather4 special case
19105 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(Value: dmask);
19106
19107 bool Packed = !Subtarget->hasUnpackedD16VMem();
19108
19109 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
19110
19111 // Abandon attempt if the dst size isn't large enough
19112 // - this is in fact an error but this is picked up elsewhere and
19113 // reported correctly.
19114 const TargetRegisterClass *DstRC = TII->getRegClass(MCID: MI.getDesc(), OpNum: DstIdx);
19115
19116 uint32_t DstSize = TRI.getRegSizeInBits(RC: *DstRC) / 32;
19117 if (DstSize < InitIdx)
19118 return;
19119 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(Opc: MI.getOpcode())) {
19120 const TargetRegisterClass *DstRC = TII->getRegClass(MCID: MI.getDesc(), OpNum: DstIdx);
19121 InitIdx = TRI.getRegSizeInBits(RC: *DstRC) / 32;
19122 } else {
19123 return;
19124 }
19125
19126 const DebugLoc &DL = MI.getDebugLoc();
19127
19128 // Create a register for the initialization value.
19129 Register PrevDst = MRI.cloneVirtualRegister(VReg: MI.getOperand(i: DstIdx).getReg());
19130 unsigned NewDst = 0; // Final initialized value will be in here
19131
19132 // If PRTStrictNull feature is enabled (the default) then initialize
19133 // all the result registers to 0, otherwise just the error indication
19134 // register (VGPRn+1)
19135 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
19136 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
19137
19138 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: PrevDst);
19139 for (; SizeLeft; SizeLeft--, CurrIdx++) {
19140 NewDst = MRI.createVirtualRegister(RegClass: TII->getOpRegClass(MI, OpNo: DstIdx));
19141 // Initialize dword
19142 Register SubReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
19143 // clang-format off
19144 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: SubReg)
19145 .addImm(Val: 0);
19146 // clang-format on
19147 // Insert into the super-reg
19148 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::INSERT_SUBREG), DestReg: NewDst)
19149 .addReg(RegNo: PrevDst)
19150 .addReg(RegNo: SubReg)
19151 .addImm(Val: SIRegisterInfo::getSubRegFromChannel(Channel: CurrIdx));
19152
19153 PrevDst = NewDst;
19154 }
19155
19156 // Add as an implicit operand
19157 MI.addOperand(Op: MachineOperand::CreateReg(Reg: NewDst, isDef: false, isImp: true));
19158
19159 // Tie the just added implicit operand to the dst
19160 MI.tieOperands(DefIdx: DstIdx, UseIdx: MI.getNumOperands() - 1);
19161}
19162
19163/// Assign the register class depending on the number of
19164/// bits set in the writemask
19165void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
19166 SDNode *Node) const {
19167 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
19168
19169 MachineFunction *MF = MI.getMF();
19170 MachineRegisterInfo &MRI = MF->getRegInfo();
19171
19172 if (TII->isVOP3(Opcode: MI.getOpcode())) {
19173 // Make sure constant bus requirements are respected.
19174 TII->legalizeOperandsVOP3(MRI, MI);
19175
19176 if (TII->isMAI(MI)) {
19177 // The ordinary src0, src1, src2 were legalized above.
19178 //
19179 // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
19180 // as a separate instruction.
19181 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(),
19182 Name: AMDGPU::OpName::scale_src0);
19183 if (Src0Idx != -1) {
19184 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(),
19185 Name: AMDGPU::OpName::scale_src1);
19186 if (TII->usesConstantBus(MRI, MI, OpIdx: Src0Idx) &&
19187 TII->usesConstantBus(MRI, MI, OpIdx: Src1Idx))
19188 TII->legalizeOpWithMove(MI, OpIdx: Src1Idx);
19189 }
19190 }
19191
19192 return;
19193 }
19194
19195 if (TII->isImage(MI))
19196 TII->enforceOperandRCAlignment(MI, OpName: AMDGPU::OpName::vaddr);
19197}
19198
19199static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
19200 uint64_t Val) {
19201 SDValue K = DAG.getTargetConstant(Val, DL, VT: MVT::i32);
19202 return SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32, Op1: K), 0);
19203}
19204
19205MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
19206 const SDLoc &DL,
19207 SDValue Ptr) const {
19208 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
19209
19210 // Build the half of the subregister with the constants before building the
19211 // full 128-bit register. If we are building multiple resource descriptors,
19212 // this will allow CSEing of the 2-component register.
19213 const SDValue Ops0[] = {
19214 DAG.getTargetConstant(Val: AMDGPU::SGPR_64RegClassID, DL, VT: MVT::i32),
19215 buildSMovImm32(DAG, DL, Val: 0),
19216 DAG.getTargetConstant(Val: AMDGPU::sub0, DL, VT: MVT::i32),
19217 buildSMovImm32(DAG, DL, Val: TII->getDefaultRsrcDataFormat() >> 32),
19218 DAG.getTargetConstant(Val: AMDGPU::sub1, DL, VT: MVT::i32)};
19219
19220 SDValue SubRegHi = SDValue(
19221 DAG.getMachineNode(Opcode: AMDGPU::REG_SEQUENCE, dl: DL, VT: MVT::v2i32, Ops: Ops0), 0);
19222
19223 // Combine the constants and the pointer.
19224 const SDValue Ops1[] = {
19225 DAG.getTargetConstant(Val: AMDGPU::SGPR_128RegClassID, DL, VT: MVT::i32), Ptr,
19226 DAG.getTargetConstant(Val: AMDGPU::sub0_sub1, DL, VT: MVT::i32), SubRegHi,
19227 DAG.getTargetConstant(Val: AMDGPU::sub2_sub3, DL, VT: MVT::i32)};
19228
19229 return DAG.getMachineNode(Opcode: AMDGPU::REG_SEQUENCE, dl: DL, VT: MVT::v4i32, Ops: Ops1);
19230}
19231
19232/// Return a resource descriptor with the 'Add TID' bit enabled
19233/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
19234/// of the resource descriptor) to create an offset, which is added to
19235/// the resource pointer.
19236MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
19237 SDValue Ptr, uint32_t RsrcDword1,
19238 uint64_t RsrcDword2And3) const {
19239 SDValue PtrLo = DAG.getTargetExtractSubreg(SRIdx: AMDGPU::sub0, DL, VT: MVT::i32, Operand: Ptr);
19240 SDValue PtrHi = DAG.getTargetExtractSubreg(SRIdx: AMDGPU::sub1, DL, VT: MVT::i32, Operand: Ptr);
19241 if (RsrcDword1) {
19242 PtrHi =
19243 SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_OR_B32, dl: DL, VT: MVT::i32, Op1: PtrHi,
19244 Op2: DAG.getConstant(Val: RsrcDword1, DL, VT: MVT::i32)),
19245 0);
19246 }
19247
19248 SDValue DataLo =
19249 buildSMovImm32(DAG, DL, Val: RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
19250 SDValue DataHi = buildSMovImm32(DAG, DL, Val: RsrcDword2And3 >> 32);
19251
19252 const SDValue Ops[] = {
19253 DAG.getTargetConstant(Val: AMDGPU::SGPR_128RegClassID, DL, VT: MVT::i32),
19254 PtrLo,
19255 DAG.getTargetConstant(Val: AMDGPU::sub0, DL, VT: MVT::i32),
19256 PtrHi,
19257 DAG.getTargetConstant(Val: AMDGPU::sub1, DL, VT: MVT::i32),
19258 DataLo,
19259 DAG.getTargetConstant(Val: AMDGPU::sub2, DL, VT: MVT::i32),
19260 DataHi,
19261 DAG.getTargetConstant(Val: AMDGPU::sub3, DL, VT: MVT::i32)};
19262
19263 return DAG.getMachineNode(Opcode: AMDGPU::REG_SEQUENCE, dl: DL, VT: MVT::v4i32, Ops);
19264}
19265
19266//===----------------------------------------------------------------------===//
19267// SI Inline Assembly Support
19268//===----------------------------------------------------------------------===//
19269
19270std::pair<unsigned, const TargetRegisterClass *>
19271SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
19272 StringRef Constraint,
19273 MVT VT) const {
19274 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
19275
19276 const TargetRegisterClass *RC = nullptr;
19277 if (Constraint.size() == 1) {
19278 // Check if we cannot determine the bit size of the given value type. This
19279 // can happen, for example, in this situation where we have an empty struct
19280 // (size 0): `call void asm "", "v"({} poison)`-
19281 if (VT == MVT::Other)
19282 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
19283 const unsigned BitWidth = VT.getSizeInBits();
19284 switch (Constraint[0]) {
19285 default:
19286 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
19287 case 's':
19288 case 'r':
19289 switch (BitWidth) {
19290 case 16:
19291 RC = &AMDGPU::SReg_32RegClass;
19292 break;
19293 case 64:
19294 RC = &AMDGPU::SGPR_64RegClass;
19295 break;
19296 default:
19297 RC = SIRegisterInfo::getSGPRClassForBitWidth(BitWidth);
19298 if (!RC)
19299 return std::pair(0U, nullptr);
19300 break;
19301 }
19302 break;
19303 case 'v':
19304 switch (BitWidth) {
19305 case 1:
19306 return std::pair(0U, nullptr);
19307 case 16:
19308 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
19309 : &AMDGPU::VGPR_32_Lo256RegClass;
19310 break;
19311 default:
19312 RC = Subtarget->has1024AddressableVGPRs()
19313 ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
19314 : TRI->getVGPRClassForBitWidth(BitWidth);
19315 if (!RC)
19316 return std::pair(0U, nullptr);
19317 break;
19318 }
19319 break;
19320 case 'a':
19321 if (!Subtarget->hasMAIInsts())
19322 break;
19323 switch (BitWidth) {
19324 case 1:
19325 return std::pair(0U, nullptr);
19326 case 16:
19327 RC = &AMDGPU::AGPR_32RegClass;
19328 break;
19329 default:
19330 RC = TRI->getAGPRClassForBitWidth(BitWidth);
19331 if (!RC)
19332 return std::pair(0U, nullptr);
19333 break;
19334 }
19335 break;
19336 }
19337 } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
19338 const unsigned BitWidth = VT.getSizeInBits();
19339 switch (BitWidth) {
19340 case 16:
19341 RC = &AMDGPU::AV_32RegClass;
19342 break;
19343 default:
19344 RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
19345 if (!RC)
19346 return std::pair(0U, nullptr);
19347 break;
19348 }
19349 }
19350
19351 // We actually support i128, i16 and f16 as inline parameters
19352 // even if they are not reported as legal
19353 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
19354 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
19355 return std::pair(0U, RC);
19356
19357 auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint);
19358 if (Kind != '\0') {
19359 if (Kind == 'v') {
19360 RC = &AMDGPU::VGPR_32_Lo256RegClass;
19361 } else if (Kind == 's') {
19362 RC = &AMDGPU::SGPR_32RegClass;
19363 } else if (Kind == 'a') {
19364 RC = &AMDGPU::AGPR_32RegClass;
19365 }
19366
19367 if (RC) {
19368 if (NumRegs > 1) {
19369 if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 >= RC->getNumRegs())
19370 return std::pair(0U, nullptr);
19371
19372 uint32_t Width = NumRegs * 32;
19373 // Prohibit constraints for register ranges with a width that does not
19374 // match the required type.
19375 if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
19376 return std::pair(0U, nullptr);
19377
19378 MCRegister Reg = RC->getRegister(i: Idx);
19379 if (SIRegisterInfo::isVGPRClass(RC))
19380 RC = TRI->getVGPRClassForBitWidth(BitWidth: Width);
19381 else if (SIRegisterInfo::isSGPRClass(RC))
19382 RC = TRI->getSGPRClassForBitWidth(BitWidth: Width);
19383 else if (SIRegisterInfo::isAGPRClass(RC))
19384 RC = TRI->getAGPRClassForBitWidth(BitWidth: Width);
19385 if (RC) {
19386 Reg = TRI->getMatchingSuperReg(Reg, SubIdx: AMDGPU::sub0, RC);
19387 if (!Reg) {
19388 // The register class does not contain the requested register,
19389 // e.g., because it is an SGPR pair that would violate alignment
19390 // requirements.
19391 return std::pair(0U, nullptr);
19392 }
19393 return std::pair(Reg, RC);
19394 }
19395 }
19396
19397 // Reject types that do not fit a single 32-bit register: any scalar wider
19398 // than 32 bits, or a vector that is not exactly 32 bits.
19399 if (VT.SimpleTy != MVT::Other &&
19400 (VT.getSizeInBits() > 32 ||
19401 (VT.isVector() && VT.getSizeInBits() != 32)))
19402 return std::pair(0U, nullptr);
19403 if (RC && Idx < RC->getNumRegs())
19404 return std::pair(RC->getRegister(i: Idx), RC);
19405 return std::pair(0U, nullptr);
19406 }
19407 }
19408
19409 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
19410 if (Ret.first)
19411 Ret.second = TRI->getPhysRegBaseClass(Reg: Ret.first);
19412
19413 return Ret;
19414}
19415
19416static bool isImmConstraint(StringRef Constraint) {
19417 if (Constraint.size() == 1) {
19418 switch (Constraint[0]) {
19419 default:
19420 break;
19421 case 'I':
19422 case 'J':
19423 case 'A':
19424 case 'B':
19425 case 'C':
19426 return true;
19427 }
19428 } else if (Constraint == "DA" || Constraint == "DB") {
19429 return true;
19430 }
19431 return false;
19432}
19433
19434SITargetLowering::ConstraintType
19435SITargetLowering::getConstraintType(StringRef Constraint) const {
19436 if (Constraint.size() == 1) {
19437 switch (Constraint[0]) {
19438 default:
19439 break;
19440 case 's':
19441 case 'v':
19442 case 'a':
19443 return C_RegisterClass;
19444 }
19445 } else if (Constraint.size() == 2) {
19446 if (Constraint == "VA")
19447 return C_RegisterClass;
19448 }
19449 if (isImmConstraint(Constraint)) {
19450 return C_Other;
19451 }
19452 return TargetLowering::getConstraintType(Constraint);
19453}
19454
19455static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
19456 if (!AMDGPU::isInlinableIntLiteral(Literal: Val)) {
19457 Val = Val & maskTrailingOnes<uint64_t>(N: Size);
19458 }
19459 return Val;
19460}
19461
19462void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
19463 StringRef Constraint,
19464 std::vector<SDValue> &Ops,
19465 SelectionDAG &DAG) const {
19466 if (isImmConstraint(Constraint)) {
19467 uint64_t Val;
19468 if (getAsmOperandConstVal(Op, Val) &&
19469 checkAsmConstraintVal(Op, Constraint, Val)) {
19470 Val = clearUnusedBits(Val, Size: Op.getScalarValueSizeInBits());
19471 Ops.push_back(x: DAG.getTargetConstant(Val, DL: SDLoc(Op), VT: MVT::i64));
19472 }
19473 } else {
19474 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
19475 }
19476}
19477
19478bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
19479 unsigned Size = Op.getScalarValueSizeInBits();
19480 if (Size > 64)
19481 return false;
19482
19483 if (Size == 16 && !Subtarget->has16BitInsts())
19484 return false;
19485
19486 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: Op)) {
19487 Val = C->getSExtValue();
19488 return true;
19489 }
19490 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: Op)) {
19491 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
19492 return true;
19493 }
19494 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Val&: Op)) {
19495 if (Size != 16 || Op.getNumOperands() != 2)
19496 return false;
19497 if (Op.getOperand(i: 0).isUndef() || Op.getOperand(i: 1).isUndef())
19498 return false;
19499 if (ConstantSDNode *C = V->getConstantSplatNode()) {
19500 Val = C->getSExtValue();
19501 return true;
19502 }
19503 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
19504 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
19505 return true;
19506 }
19507 }
19508
19509 return false;
19510}
19511
19512bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
19513 uint64_t Val) const {
19514 if (Constraint.size() == 1) {
19515 switch (Constraint[0]) {
19516 case 'I':
19517 return AMDGPU::isInlinableIntLiteral(Literal: Val);
19518 case 'J':
19519 return isInt<16>(x: Val);
19520 case 'A':
19521 return checkAsmConstraintValA(Op, Val);
19522 case 'B':
19523 return isInt<32>(x: Val);
19524 case 'C':
19525 return isUInt<32>(x: clearUnusedBits(Val, Size: Op.getScalarValueSizeInBits())) ||
19526 AMDGPU::isInlinableIntLiteral(Literal: Val);
19527 default:
19528 break;
19529 }
19530 } else if (Constraint.size() == 2) {
19531 if (Constraint == "DA") {
19532 int64_t HiBits = static_cast<int32_t>(Val >> 32);
19533 int64_t LoBits = static_cast<int32_t>(Val);
19534 return checkAsmConstraintValA(Op, Val: HiBits, MaxSize: 32) &&
19535 checkAsmConstraintValA(Op, Val: LoBits, MaxSize: 32);
19536 }
19537 if (Constraint == "DB") {
19538 return true;
19539 }
19540 }
19541 llvm_unreachable("Invalid asm constraint");
19542}
19543
19544bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
19545 unsigned MaxSize) const {
19546 unsigned Size = std::min<unsigned>(a: Op.getScalarValueSizeInBits(), b: MaxSize);
19547 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
19548 if (Size == 16) {
19549 MVT VT = Op.getSimpleValueType();
19550 switch (VT.SimpleTy) {
19551 default:
19552 return false;
19553 case MVT::i16:
19554 return AMDGPU::isInlinableLiteralI16(Literal: Val, HasInv2Pi);
19555 case MVT::f16:
19556 return AMDGPU::isInlinableLiteralFP16(Literal: Val, HasInv2Pi);
19557 case MVT::bf16:
19558 return AMDGPU::isInlinableLiteralBF16(Literal: Val, HasInv2Pi);
19559 case MVT::v2i16:
19560 return AMDGPU::getInlineEncodingV2I16(Literal: Val).has_value();
19561 case MVT::v2f16:
19562 return AMDGPU::getInlineEncodingV2F16(Literal: Val).has_value();
19563 case MVT::v2bf16:
19564 return AMDGPU::getInlineEncodingV2BF16(Literal: Val).has_value();
19565 }
19566 }
19567 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Literal: Val, HasInv2Pi)) ||
19568 (Size == 64 && AMDGPU::isInlinableLiteral64(Literal: Val, HasInv2Pi)))
19569 return true;
19570 return false;
19571}
19572
19573static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
19574 switch (UnalignedClassID) {
19575 case AMDGPU::VReg_64RegClassID:
19576 return AMDGPU::VReg_64_Align2RegClassID;
19577 case AMDGPU::VReg_96RegClassID:
19578 return AMDGPU::VReg_96_Align2RegClassID;
19579 case AMDGPU::VReg_128RegClassID:
19580 return AMDGPU::VReg_128_Align2RegClassID;
19581 case AMDGPU::VReg_160RegClassID:
19582 return AMDGPU::VReg_160_Align2RegClassID;
19583 case AMDGPU::VReg_192RegClassID:
19584 return AMDGPU::VReg_192_Align2RegClassID;
19585 case AMDGPU::VReg_224RegClassID:
19586 return AMDGPU::VReg_224_Align2RegClassID;
19587 case AMDGPU::VReg_256RegClassID:
19588 return AMDGPU::VReg_256_Align2RegClassID;
19589 case AMDGPU::VReg_288RegClassID:
19590 return AMDGPU::VReg_288_Align2RegClassID;
19591 case AMDGPU::VReg_320RegClassID:
19592 return AMDGPU::VReg_320_Align2RegClassID;
19593 case AMDGPU::VReg_352RegClassID:
19594 return AMDGPU::VReg_352_Align2RegClassID;
19595 case AMDGPU::VReg_384RegClassID:
19596 return AMDGPU::VReg_384_Align2RegClassID;
19597 case AMDGPU::VReg_512RegClassID:
19598 return AMDGPU::VReg_512_Align2RegClassID;
19599 case AMDGPU::VReg_1024RegClassID:
19600 return AMDGPU::VReg_1024_Align2RegClassID;
19601 case AMDGPU::AReg_64RegClassID:
19602 return AMDGPU::AReg_64_Align2RegClassID;
19603 case AMDGPU::AReg_96RegClassID:
19604 return AMDGPU::AReg_96_Align2RegClassID;
19605 case AMDGPU::AReg_128RegClassID:
19606 return AMDGPU::AReg_128_Align2RegClassID;
19607 case AMDGPU::AReg_160RegClassID:
19608 return AMDGPU::AReg_160_Align2RegClassID;
19609 case AMDGPU::AReg_192RegClassID:
19610 return AMDGPU::AReg_192_Align2RegClassID;
19611 case AMDGPU::AReg_256RegClassID:
19612 return AMDGPU::AReg_256_Align2RegClassID;
19613 case AMDGPU::AReg_512RegClassID:
19614 return AMDGPU::AReg_512_Align2RegClassID;
19615 case AMDGPU::AReg_1024RegClassID:
19616 return AMDGPU::AReg_1024_Align2RegClassID;
19617 default:
19618 return -1;
19619 }
19620}
19621
19622// Figure out which registers should be reserved for stack access. Only after
19623// the function is legalized do we know all of the non-spill stack objects or if
19624// calls are present.
19625void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
19626 MachineRegisterInfo &MRI = MF.getRegInfo();
19627 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
19628 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
19629 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
19630 const SIInstrInfo *TII = ST.getInstrInfo();
19631
19632 if (Info->isEntryFunction()) {
19633 // Callable functions have fixed registers used for stack access.
19634 reservePrivateMemoryRegs(TM: getTargetMachine(), MF, TRI: *TRI, Info&: *Info);
19635 }
19636
19637 // TODO: Move this logic to getReservedRegs()
19638 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
19639 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
19640 Register SReg = ST.isWave32()
19641 ? AMDGPU::SGPR_32RegClass.getRegister(i: MaxNumSGPRs - 1)
19642 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
19643 RC: &AMDGPU::SGPR_64RegClass);
19644 Info->setSGPRForEXECCopy(SReg);
19645
19646 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
19647 Info->getStackPtrOffsetReg()));
19648 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
19649 MRI.replaceRegWith(FromReg: AMDGPU::SP_REG, ToReg: Info->getStackPtrOffsetReg());
19650
19651 // We need to worry about replacing the default register with itself in case
19652 // of MIR testcases missing the MFI.
19653 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
19654 MRI.replaceRegWith(FromReg: AMDGPU::PRIVATE_RSRC_REG, ToReg: Info->getScratchRSrcReg());
19655
19656 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
19657 MRI.replaceRegWith(FromReg: AMDGPU::FP_REG, ToReg: Info->getFrameOffsetReg());
19658
19659 Info->limitOccupancy(MF);
19660
19661 if (ST.isWave32() && !MF.empty()) {
19662 for (auto &MBB : MF) {
19663 for (auto &MI : MBB) {
19664 TII->fixImplicitOperands(MI);
19665 }
19666 }
19667 }
19668
19669 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
19670 // classes if required. Ideally the register class constraints would differ
19671 // per-subtarget, but there's no easy way to achieve that right now. This is
19672 // not a problem for VGPRs because the correctly aligned VGPR class is implied
19673 // from using them as the register class for legal types.
19674 if (ST.needsAlignedVGPRs()) {
19675 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
19676 const Register Reg = Register::index2VirtReg(Index: I);
19677 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
19678 if (!RC)
19679 continue;
19680 int NewClassID = getAlignedAGPRClassID(UnalignedClassID: RC->getID());
19681 if (NewClassID != -1)
19682 MRI.setRegClass(Reg, RC: TRI->getRegClass(i: NewClassID));
19683 }
19684 }
19685
19686 TargetLoweringBase::finalizeLowering(MF);
19687}
19688
19689void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
19690 KnownBits &Known,
19691 const APInt &DemandedElts,
19692 const SelectionDAG &DAG,
19693 unsigned Depth) const {
19694 Known.resetAll();
19695 unsigned Opc = Op.getOpcode();
19696 switch (Opc) {
19697 case ISD::INTRINSIC_WO_CHAIN: {
19698 unsigned IID = Op.getConstantOperandVal(i: 0);
19699 switch (IID) {
19700 case Intrinsic::amdgcn_mbcnt_lo:
19701 case Intrinsic::amdgcn_mbcnt_hi: {
19702 const GCNSubtarget &ST =
19703 DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
19704 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
19705 // most 31 + src1.
19706 Known.Zero.setBitsFrom(
19707 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
19708 KnownBits Known2 = DAG.computeKnownBits(Op: Op.getOperand(i: 2), Depth: Depth + 1);
19709 Known = KnownBits::add(LHS: Known, RHS: Known2);
19710 return;
19711 }
19712 }
19713 break;
19714 }
19715 }
19716 return AMDGPUTargetLowering::computeKnownBitsForTargetNode(
19717 Op, Known, DemandedElts, DAG, Depth);
19718}
19719
19720void SITargetLowering::computeKnownBitsForFrameIndex(
19721 const int FI, KnownBits &Known, const MachineFunction &MF) const {
19722 TargetLowering::computeKnownBitsForFrameIndex(FIOp: FI, Known, MF);
19723
19724 // Set the high bits to zero based on the maximum allowed scratch size per
19725 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
19726 // calculation won't overflow, so assume the sign bit is never set.
19727 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
19728}
19729
19730static void knownBitsForWorkitemID(const GCNSubtarget &ST,
19731 GISelValueTracking &VT, KnownBits &Known,
19732 unsigned Dim) {
19733 unsigned MaxValue =
19734 ST.getMaxWorkitemID(Kernel: VT.getMachineFunction().getFunction(), Dimension: Dim);
19735 Known.Zero.setHighBits(llvm::countl_zero(Val: MaxValue));
19736}
19737
19738static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT,
19739 KnownBits &Known, const APInt &DemandedElts,
19740 unsigned BFEWidth, bool SExt, unsigned Depth) {
19741 const MachineRegisterInfo &MRI = VT.getMachineFunction().getRegInfo();
19742 const MachineOperand &Src1 = MI.getOperand(i: 2);
19743
19744 unsigned Src1Cst = 0;
19745 if (Src1.isImm()) {
19746 Src1Cst = Src1.getImm();
19747 } else if (Src1.isReg()) {
19748 auto Cst = getIConstantVRegValWithLookThrough(VReg: Src1.getReg(), MRI);
19749 if (!Cst)
19750 return;
19751 Src1Cst = Cst->Value.getZExtValue();
19752 } else {
19753 return;
19754 }
19755
19756 // Offset is at bits [4:0] for 32 bit, [5:0] for 64 bit.
19757 // Width is always [22:16].
19758 const unsigned Offset =
19759 Src1Cst & maskTrailingOnes<unsigned>(N: (BFEWidth == 32) ? 5 : 6);
19760 const unsigned Width = (Src1Cst >> 16) & maskTrailingOnes<unsigned>(N: 6);
19761
19762 if (Width >= BFEWidth) // Ill-formed.
19763 return;
19764
19765 VT.computeKnownBitsImpl(R: MI.getOperand(i: 1).getReg(), Known, DemandedElts,
19766 Depth: Depth + 1);
19767
19768 Known = Known.extractBits(NumBits: Width, BitPosition: Offset);
19769
19770 if (SExt)
19771 Known = Known.sext(BitWidth: BFEWidth);
19772 else
19773 Known = Known.zext(BitWidth: BFEWidth);
19774}
19775
19776void SITargetLowering::computeKnownBitsForTargetInstr(
19777 GISelValueTracking &VT, Register R, KnownBits &Known,
19778 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
19779 unsigned Depth) const {
19780 Known.resetAll();
19781 const MachineInstr *MI = MRI.getVRegDef(Reg: R);
19782 switch (MI->getOpcode()) {
19783 case AMDGPU::S_BFE_I32:
19784 return knownBitsForSBFE(MI: *MI, VT, Known, DemandedElts, /*Width=*/BFEWidth: 32,
19785 /*SExt=*/true, Depth);
19786 case AMDGPU::S_BFE_U32:
19787 return knownBitsForSBFE(MI: *MI, VT, Known, DemandedElts, /*Width=*/BFEWidth: 32,
19788 /*SExt=*/false, Depth);
19789 case AMDGPU::S_BFE_I64:
19790 return knownBitsForSBFE(MI: *MI, VT, Known, DemandedElts, /*Width=*/BFEWidth: 64,
19791 /*SExt=*/true, Depth);
19792 case AMDGPU::S_BFE_U64:
19793 return knownBitsForSBFE(MI: *MI, VT, Known, DemandedElts, /*Width=*/BFEWidth: 64,
19794 /*SExt=*/false, Depth);
19795 case AMDGPU::G_INTRINSIC:
19796 case AMDGPU::G_INTRINSIC_CONVERGENT: {
19797 Intrinsic::ID IID = cast<GIntrinsic>(Val: MI)->getIntrinsicID();
19798 switch (IID) {
19799 case Intrinsic::amdgcn_workitem_id_x:
19800 knownBitsForWorkitemID(ST: *getSubtarget(), VT, Known, Dim: 0);
19801 break;
19802 case Intrinsic::amdgcn_workitem_id_y:
19803 knownBitsForWorkitemID(ST: *getSubtarget(), VT, Known, Dim: 1);
19804 break;
19805 case Intrinsic::amdgcn_workitem_id_z:
19806 knownBitsForWorkitemID(ST: *getSubtarget(), VT, Known, Dim: 2);
19807 break;
19808 case Intrinsic::amdgcn_mbcnt_lo:
19809 case Intrinsic::amdgcn_mbcnt_hi: {
19810 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
19811 // most 31 + src1.
19812 Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
19813 ? getSubtarget()->getWavefrontSizeLog2()
19814 : 5);
19815 KnownBits Known2;
19816 VT.computeKnownBitsImpl(R: MI->getOperand(i: 3).getReg(), Known&: Known2, DemandedElts,
19817 Depth: Depth + 1);
19818 Known = KnownBits::add(LHS: Known, RHS: Known2);
19819 break;
19820 }
19821 case Intrinsic::amdgcn_groupstaticsize: {
19822 // We can report everything over the maximum size as 0. We can't report
19823 // based on the actual size because we don't know if it's accurate or not
19824 // at any given point.
19825 Known.Zero.setHighBits(
19826 llvm::countl_zero(Val: getSubtarget()->getAddressableLocalMemorySize()));
19827 break;
19828 }
19829 }
19830 break;
19831 }
19832 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
19833 Known.Zero.setHighBits(24);
19834 break;
19835 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
19836 Known.Zero.setHighBits(16);
19837 break;
19838 case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
19839 // G_AMDGPU_COPY_SCC_VCC converts a uniform boolean in VCC to SGPR s32,
19840 // producing exactly 0 or 1.
19841 Known.Zero.setHighBits(Known.getBitWidth() - 1);
19842 break;
19843 case AMDGPU::G_AMDGPU_SMED3:
19844 case AMDGPU::G_AMDGPU_UMED3: {
19845 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
19846
19847 KnownBits Known2;
19848 VT.computeKnownBitsImpl(R: Src2, Known&: Known2, DemandedElts, Depth: Depth + 1);
19849 if (Known2.isUnknown())
19850 break;
19851
19852 KnownBits Known1;
19853 VT.computeKnownBitsImpl(R: Src1, Known&: Known1, DemandedElts, Depth: Depth + 1);
19854 if (Known1.isUnknown())
19855 break;
19856
19857 KnownBits Known0;
19858 VT.computeKnownBitsImpl(R: Src0, Known&: Known0, DemandedElts, Depth: Depth + 1);
19859 if (Known0.isUnknown())
19860 break;
19861
19862 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
19863 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
19864 Known.One = Known0.One & Known1.One & Known2.One;
19865 break;
19866 }
19867 }
19868}
19869
19870Align SITargetLowering::computeKnownAlignForTargetInstr(
19871 GISelValueTracking &VT, Register R, const MachineRegisterInfo &MRI,
19872 unsigned Depth) const {
19873 const MachineInstr *MI = MRI.getVRegDef(Reg: R);
19874 if (auto *GI = dyn_cast<GIntrinsic>(Val: MI)) {
19875 // FIXME: Can this move to generic code? What about the case where the call
19876 // site specifies a lower alignment?
19877 Intrinsic::ID IID = GI->getIntrinsicID();
19878 LLVMContext &Ctx = VT.getMachineFunction().getFunction().getContext();
19879 AttributeList Attrs =
19880 Intrinsic::getAttributes(C&: Ctx, id: IID, FT: Intrinsic::getType(Context&: Ctx, id: IID));
19881 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
19882 return *RetAlign;
19883 }
19884 return Align(1);
19885}
19886
/// Return the preferred alignment for the header of loop \p ML.
///
/// The result is one of: the generic preferred alignment (PrefAlign), the
/// header's existing alignment when it already differs from PrefAlign,
/// Align(32) for the fetch-window case, or Align(64) (one cache line).
///
/// Note that this is not a pure query: for loops larger than 128 bytes (and at
/// most 192) that have both a preheader and an exit block, it may insert
/// S_INST_PREFETCH instructions before the preheader's first terminator and at
/// the start of the exit block.
Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
  const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
  const Align CacheLineAlign = Align(64);

  // GFX950: Prevent an 8-byte instruction at loop header from being split by
  // the 32-byte instruction fetch window boundary. This avoids a significant
  // fetch delay after backward branch. We use 32-byte alignment with max
  // padding of 4 bytes (one s_nop), see getMaxPermittedBytesForAlignment().
  if (ML && !DisableLoopAlignment &&
      getSubtarget()->hasLoopHeadInstSplitSensitivity()) {
    const MachineBasicBlock *Header = ML->getHeader();
    // Respect user-specified or previously set alignment.
    if (Header->getAlignment() != PrefAlign)
      return Header->getAlignment();
    if (needsFetchWindowAlignment(MBB: *Header))
      return Align(32);
  }

  // Pre-GFX10 target did not benefit from loop alignment
  if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
      getSubtarget()->hasInstFwdPrefetchBug())
    return PrefAlign;

  // On GFX10 I$ is 4 x 64 bytes cache lines.
  // By default prefetcher keeps one cache line behind and reads two ahead.
  // We can modify it with S_INST_PREFETCH for larger loops to have two lines
  // behind and one ahead.
  // Therefore we can benefit from aligning loop headers if loop fits 192 bytes.
  // If loop fits 64 bytes it always spans no more than two cache lines and
  // does not need an alignment.
  // Else if loop is less or equal 128 bytes we do not need to modify prefetch,
  // Else if loop is less or equal 192 bytes we need two lines behind.

  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  const MachineBasicBlock *Header = ML->getHeader();
  if (Header->getAlignment() != PrefAlign)
    return Header->getAlignment(); // Already processed.

  // Estimate the loop's size in bytes, bailing out as soon as it exceeds the
  // 192-byte limit.
  unsigned LoopSize = 0;
  for (const MachineBasicBlock *MBB : ML->blocks()) {
    // If inner loop block is aligned assume in average half of the alignment
    // size to be added as nops.
    if (MBB != Header)
      LoopSize += MBB->getAlignment().value() / 2;

    for (const MachineInstr &MI : *MBB) {
      LoopSize += TII->getInstSizeInBytes(MI);
      if (LoopSize > 192)
        return PrefAlign;
    }
  }

  if (LoopSize <= 64)
    return PrefAlign;

  if (LoopSize <= 128)
    return CacheLineAlign;

  // If any of parent loops is surrounded by prefetch instructions do not
  // insert new for inner loop, which would reset parent's settings.
  for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
    if (MachineBasicBlock *Exit = P->getExitBlock()) {
      auto I = Exit->getFirstNonDebugInstr();
      if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
        return CacheLineAlign;
    }
  }

  MachineBasicBlock *Pre = ML->getLoopPreheader();
  MachineBasicBlock *Exit = ML->getExitBlock();

  // Bracket the loop with prefetch mode changes, skipping either insertion
  // when an S_INST_PREFETCH is already in that position.
  if (Pre && Exit) {
    auto PreTerm = Pre->getFirstTerminator();
    if (PreTerm == Pre->begin() ||
        std::prev(x: PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
      BuildMI(BB&: *Pre, I: PreTerm, MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_INST_PREFETCH))
          .addImm(Val: 1); // prefetch 2 lines behind PC

    auto ExitHead = Exit->getFirstNonDebugInstr();
    if (ExitHead == Exit->end() ||
        ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
      BuildMI(BB&: *Exit, I: ExitHead, MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_INST_PREFETCH))
          .addImm(Val: 2); // prefetch 1 line behind PC
  }

  return CacheLineAlign;
}
19974
19975unsigned SITargetLowering::getMaxPermittedBytesForAlignment(
19976 MachineBasicBlock *MBB) const {
19977 // GFX950: Limit padding to 4 bytes (one s_nop) for blocks where an 8-byte
19978 // instruction could be split by the 32-byte fetch window boundary.
19979 // See getPrefLoopAlignment() for context.
19980 if (needsFetchWindowAlignment(MBB: *MBB))
19981 return 4;
19982 return TargetLowering::getMaxPermittedBytesForAlignment(MBB);
19983}
19984
19985bool SITargetLowering::needsFetchWindowAlignment(
19986 const MachineBasicBlock &MBB) const {
19987 if (!getSubtarget()->hasLoopHeadInstSplitSensitivity())
19988 return false;
19989 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
19990 for (const MachineInstr &MI : MBB) {
19991 if (MI.isMetaInstruction())
19992 continue;
19993 // Instructions larger than 4 bytes can be split by a 32-byte boundary.
19994 return TII->getInstSizeInBytes(MI) > 4;
19995 }
19996 return false;
19997}
19998
19999[[maybe_unused]]
20000static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
20001 assert(N->getOpcode() == ISD::CopyFromReg);
20002 do {
20003 // Follow the chain until we find an INLINEASM node.
20004 N = N->getOperand(Num: 0).getNode();
20005 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
20006 return true;
20007 } while (N->getOpcode() == ISD::CopyFromReg);
20008 return false;
20009}
20010
20011bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
20012 FunctionLoweringInfo *FLI,
20013 UniformityInfo *UA) const {
20014 switch (N->getOpcode()) {
20015 case ISD::CopyFromReg: {
20016 const RegisterSDNode *R = cast<RegisterSDNode>(Val: N->getOperand(Num: 1));
20017 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
20018 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
20019 Register Reg = R->getReg();
20020
20021 // FIXME: Why does this need to consider isLiveIn?
20022 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
20023 return !TRI->isSGPRReg(MRI, Reg);
20024
20025 if (const Value *V = FLI->getValueFromVirtualReg(Vreg: R->getReg()))
20026 return UA->isDivergentAtDef(V);
20027
20028 assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
20029 return !TRI->isSGPRReg(MRI, Reg);
20030 }
20031 case ISD::LOAD: {
20032 const LoadSDNode *L = cast<LoadSDNode>(Val: N);
20033 unsigned AS = L->getAddressSpace();
20034 // A flat load may access private memory.
20035 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
20036 }
20037 case ISD::CALLSEQ_END:
20038 return true;
20039 case ISD::INTRINSIC_WO_CHAIN:
20040 return AMDGPU::isIntrinsicSourceOfDivergence(IntrID: N->getConstantOperandVal(Num: 0));
20041 case ISD::INTRINSIC_W_CHAIN:
20042 return AMDGPU::isIntrinsicSourceOfDivergence(IntrID: N->getConstantOperandVal(Num: 1));
20043 case AMDGPUISD::ATOMIC_CMP_SWAP:
20044 case AMDGPUISD::BUFFER_ATOMIC_SWAP:
20045 case AMDGPUISD::BUFFER_ATOMIC_ADD:
20046 case AMDGPUISD::BUFFER_ATOMIC_SUB:
20047 case AMDGPUISD::BUFFER_ATOMIC_SMIN:
20048 case AMDGPUISD::BUFFER_ATOMIC_UMIN:
20049 case AMDGPUISD::BUFFER_ATOMIC_SMAX:
20050 case AMDGPUISD::BUFFER_ATOMIC_UMAX:
20051 case AMDGPUISD::BUFFER_ATOMIC_AND:
20052 case AMDGPUISD::BUFFER_ATOMIC_OR:
20053 case AMDGPUISD::BUFFER_ATOMIC_XOR:
20054 case AMDGPUISD::BUFFER_ATOMIC_INC:
20055 case AMDGPUISD::BUFFER_ATOMIC_DEC:
20056 case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
20057 case AMDGPUISD::BUFFER_ATOMIC_FADD:
20058 case AMDGPUISD::BUFFER_ATOMIC_FMIN:
20059 case AMDGPUISD::BUFFER_ATOMIC_FMAX:
20060 // Target-specific read-modify-write atomics are sources of divergence.
20061 return true;
20062 default:
20063 if (auto *A = dyn_cast<AtomicSDNode>(Val: N)) {
20064 // Generic read-modify-write atomics are sources of divergence.
20065 return A->readMem() && A->writeMem();
20066 }
20067 return false;
20068 }
20069}
20070
20071bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
20072 EVT VT) const {
20073 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
20074 case MVT::f32:
20075 return !denormalModeIsFlushAllF32(MF: DAG.getMachineFunction());
20076 case MVT::f64:
20077 case MVT::f16:
20078 return !denormalModeIsFlushAllF64F16(MF: DAG.getMachineFunction());
20079 default:
20080 return false;
20081 }
20082}
20083
20084bool SITargetLowering::denormalsEnabledForType(
20085 LLT Ty, const MachineFunction &MF) const {
20086 switch (Ty.getScalarSizeInBits()) {
20087 case 32:
20088 return !denormalModeIsFlushAllF32(MF);
20089 case 64:
20090 case 16:
20091 return !denormalModeIsFlushAllF64F16(MF);
20092 default:
20093 return false;
20094 }
20095}
20096
20097bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
20098 const APInt &DemandedElts,
20099 const SelectionDAG &DAG,
20100 bool SNaN,
20101 unsigned Depth) const {
20102 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
20103 const MachineFunction &MF = DAG.getMachineFunction();
20104 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
20105
20106 if (Info->getMode().DX10Clamp)
20107 return true; // Clamped to 0.
20108 return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 0), SNaN, Depth: Depth + 1);
20109 }
20110
20111 return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DemandedElts,
20112 DAG, SNaN, Depth);
20113}
20114
20115// On older subtargets, global FP atomic instructions have a hardcoded FP mode
20116// and do not support FP32 denormals, and only support v2f16/f64 denormals.
20117static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW) {
20118 if (RMW->hasMetadata(Kind: "amdgpu.ignore.denormal.mode"))
20119 return true;
20120
20121 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
20122 auto DenormMode = RMW->getFunction()->getDenormalMode(FPType: Flt);
20123 if (DenormMode == DenormalMode::getPreserveSign())
20124 return true;
20125
20126 // TODO: Remove this.
20127 return RMW->getFunction()
20128 ->getFnAttribute(Kind: "amdgpu-unsafe-fp-atomics")
20129 .getValueAsBool();
20130}
20131
20132static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
20133 LLVMContext &Ctx = RMW->getContext();
20134 StringRef MemScope =
20135 Ctx.getSyncScopeName(Id: RMW->getSyncScopeID()).value_or(u: "system");
20136
20137 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
20138 << "Hardware instruction generated for atomic "
20139 << RMW->getOperationName(Op: RMW->getOperation())
20140 << " operation at memory scope " << MemScope;
20141}
20142
20143static bool isV2F16OrV2BF16(Type *Ty) {
20144 if (auto *VT = dyn_cast<FixedVectorType>(Val: Ty)) {
20145 Type *EltTy = VT->getElementType();
20146 return VT->getNumElements() == 2 &&
20147 (EltTy->isHalfTy() || EltTy->isBFloatTy());
20148 }
20149
20150 return false;
20151}
20152
20153static bool isV2F16(Type *Ty) {
20154 FixedVectorType *VT = dyn_cast<FixedVectorType>(Val: Ty);
20155 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
20156}
20157
20158static bool isV2BF16(Type *Ty) {
20159 FixedVectorType *VT = dyn_cast<FixedVectorType>(Val: Ty);
20160 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
20161}
20162
20163/// \return true if atomicrmw integer ops work for the type.
20164static bool isAtomicRMWLegalIntTy(Type *Ty) {
20165 if (auto *IT = dyn_cast<IntegerType>(Val: Ty)) {
20166 unsigned BW = IT->getBitWidth();
20167 return BW == 32 || BW == 64;
20168 }
20169
20170 return false;
20171}
20172
20173/// \return true if this atomicrmw xchg type can be selected.
20174static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
20175 Type *Ty = RMW->getType();
20176 if (isAtomicRMWLegalIntTy(Ty))
20177 return true;
20178
20179 if (PointerType *PT = dyn_cast<PointerType>(Val: Ty)) {
20180 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
20181 unsigned BW = DL.getPointerSizeInBits(AS: PT->getAddressSpace());
20182 return BW == 32 || BW == 64;
20183 }
20184
20185 if (Ty->isFloatTy() || Ty->isDoubleTy())
20186 return true;
20187
20188 if (FixedVectorType *VT = dyn_cast<FixedVectorType>(Val: Ty)) {
20189 return VT->getNumElements() == 2 &&
20190 VT->getElementType()->getPrimitiveSizeInBits() == 16;
20191 }
20192
20193 return false;
20194}
20195
20196/// \returns true if it's valid to emit a native instruction for \p RMW, based
20197/// on the properties of the target memory.
20198static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
20199 const AtomicRMWInst *RMW,
20200 bool HasSystemScope) {
20201 // The remote/fine-grained access logic is different from the integer
20202 // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
20203 // fine-grained access does not work, even for a device local allocation.
20204 //
20205 // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
20206 // allocations work.
20207 if (HasSystemScope) {
20208 if (Subtarget.hasAgentScopeFineGrainedRemoteMemoryAtomics() &&
20209 RMW->hasMetadata(Kind: "amdgpu.no.remote.memory"))
20210 return true;
20211 if (Subtarget.hasEmulatedSystemScopeAtomics())
20212 return true;
20213 } else if (Subtarget.hasAgentScopeFineGrainedRemoteMemoryAtomics())
20214 return true;
20215
20216 return RMW->hasMetadata(Kind: "amdgpu.no.fine.grained.memory");
20217}
20218
20219/// \return Action to perform on AtomicRMWInsts for integer operations.
20220static TargetLowering::AtomicExpansionKind
20221atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
20222 return isAtomicRMWLegalIntTy(Ty: RMW->getType())
20223 ? TargetLowering::AtomicExpansionKind::None
20224 : TargetLowering::AtomicExpansionKind::CmpXChg;
20225}
20226
20227/// Return if a flat address space atomicrmw can access private memory.
20228static bool flatInstrMayAccessPrivate(const Instruction *I) {
20229 const MDNode *MD = I->getMetadata(KindID: LLVMContext::MD_noalias_addrspace);
20230 return !MD ||
20231 !AMDGPU::hasValueInRangeLikeMetadata(MD: *MD, Val: AMDGPUAS::PRIVATE_ADDRESS);
20232}
20233
20234static TargetLowering::AtomicExpansionKind
20235getPrivateAtomicExpansionKind(const GCNSubtarget &STI) {
20236 // For GAS, lower to flat atomic.
20237 return STI.hasGloballyAddressableScratch()
20238 ? TargetLowering::AtomicExpansionKind::CustomExpand
20239 : TargetLowering::AtomicExpansionKind::NotAtomic;
20240}
20241
20242TargetLowering::AtomicExpansionKind
20243SITargetLowering::shouldExpandAtomicRMWInIR(const AtomicRMWInst *RMW) const {
20244 unsigned AS = RMW->getPointerAddressSpace();
20245 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
20246 return getPrivateAtomicExpansionKind(STI: *getSubtarget());
20247
20248 // 64-bit flat atomics that dynamically reside in private memory will silently
20249 // be dropped.
20250 //
20251 // Note that we will emit a new copy of the original atomic in the expansion,
20252 // which will be incrementally relegalized.
20253 const DataLayout &DL = RMW->getFunction()->getDataLayout();
20254 if (AS == AMDGPUAS::FLAT_ADDRESS &&
20255 DL.getTypeSizeInBits(Ty: RMW->getType()) == 64 &&
20256 flatInstrMayAccessPrivate(I: RMW))
20257 return AtomicExpansionKind::CustomExpand;
20258
20259 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
20260 OptimizationRemarkEmitter ORE(RMW->getFunction());
20261 ORE.emit(RemarkBuilder: [=]() {
20262 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
20263 });
20264 return Kind;
20265 };
20266
20267 auto SSID = RMW->getSyncScopeID();
20268 bool HasSystemScope =
20269 SSID == SyncScope::System ||
20270 SSID == RMW->getContext().getOrInsertSyncScopeID(SSN: "one-as");
20271
20272 auto Op = RMW->getOperation();
20273 switch (Op) {
20274 case AtomicRMWInst::Xchg:
20275 // PCIe supports add and xchg for system atomics.
20276 return isAtomicRMWLegalXChgTy(RMW)
20277 ? TargetLowering::AtomicExpansionKind::None
20278 : TargetLowering::AtomicExpansionKind::CmpXChg;
20279 case AtomicRMWInst::Add:
20280 // PCIe supports add and xchg for system atomics.
20281 return atomicSupportedIfLegalIntType(RMW);
20282 case AtomicRMWInst::Sub:
20283 case AtomicRMWInst::And:
20284 case AtomicRMWInst::Or:
20285 case AtomicRMWInst::Xor:
20286 case AtomicRMWInst::Max:
20287 case AtomicRMWInst::Min:
20288 case AtomicRMWInst::UMax:
20289 case AtomicRMWInst::UMin:
20290 case AtomicRMWInst::UIncWrap:
20291 case AtomicRMWInst::UDecWrap:
20292 case AtomicRMWInst::USubCond:
20293 case AtomicRMWInst::USubSat: {
20294 if (Op == AtomicRMWInst::USubCond && !Subtarget->hasCondSubInsts())
20295 return AtomicExpansionKind::CmpXChg;
20296 if (Op == AtomicRMWInst::USubSat && !Subtarget->hasSubClampInsts())
20297 return AtomicExpansionKind::CmpXChg;
20298 if (Op == AtomicRMWInst::USubCond || Op == AtomicRMWInst::USubSat) {
20299 auto *IT = dyn_cast<IntegerType>(Val: RMW->getType());
20300 if (!IT || IT->getBitWidth() != 32)
20301 return AtomicExpansionKind::CmpXChg;
20302 }
20303
20304 if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
20305 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
20306 if (Subtarget->hasEmulatedSystemScopeAtomics())
20307 return atomicSupportedIfLegalIntType(RMW);
20308
20309 // On most subtargets, for atomicrmw operations other than add/xchg,
20310 // whether or not the instructions will behave correctly depends on where
20311 // the address physically resides and what interconnect is used in the
20312 // system configuration. On some some targets the instruction will nop,
20313 // and in others synchronization will only occur at degraded device scope.
20314 //
20315 // If the allocation is known local to the device, the instructions should
20316 // work correctly.
20317 if (RMW->hasMetadata(Kind: "amdgpu.no.remote.memory"))
20318 return atomicSupportedIfLegalIntType(RMW);
20319
20320 // If fine-grained remote memory works at device scope, we don't need to
20321 // do anything.
20322 if (!HasSystemScope &&
20323 Subtarget->hasAgentScopeFineGrainedRemoteMemoryAtomics())
20324 return atomicSupportedIfLegalIntType(RMW);
20325
20326 // If we are targeting a remote allocated address, it depends what kind of
20327 // allocation the address belongs to.
20328 //
20329 // If the allocation is fine-grained (in host memory, or in PCIe peer
20330 // device memory), the operation will fail depending on the target.
20331 //
20332 // Note fine-grained host memory access does work on APUs or if XGMI is
20333 // used, but we do not know if we are targeting an APU or the system
20334 // configuration from the ISA version/target-cpu.
20335 if (RMW->hasMetadata(Kind: "amdgpu.no.fine.grained.memory"))
20336 return atomicSupportedIfLegalIntType(RMW);
20337
20338 if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
20339 Op == AtomicRMWInst::Xor) {
20340 // Atomic sub/or/xor do not work over PCI express, but atomic add
20341 // does. InstCombine transforms these with 0 to or, so undo that.
20342 if (const Constant *ConstVal = dyn_cast<Constant>(Val: RMW->getValOperand());
20343 ConstVal && ConstVal->isNullValue())
20344 return AtomicExpansionKind::CustomExpand;
20345 }
20346
20347 // If the allocation could be in remote, fine-grained memory, the rmw
20348 // instructions may fail. cmpxchg should work, so emit that. On some
20349 // system configurations, PCIe atomics aren't supported so cmpxchg won't
20350 // even work, so you're out of luck anyway.
20351
20352 // In summary:
20353 //
20354 // Cases that may fail:
20355 // - fine-grained pinned host memory
20356 // - fine-grained migratable host memory
20357 // - fine-grained PCIe peer device
20358 //
20359 // Cases that should work, but may be treated overly conservatively.
20360 // - fine-grained host memory on an APU
20361 // - fine-grained XGMI peer device
20362 return AtomicExpansionKind::CmpXChg;
20363 }
20364
20365 return atomicSupportedIfLegalIntType(RMW);
20366 }
20367 case AtomicRMWInst::FAdd: {
20368 Type *Ty = RMW->getType();
20369
20370 // TODO: Handle REGION_ADDRESS
20371 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
20372 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
20373 // is fixed to round-to-nearest-even.
20374 //
20375 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
20376 // round-to-nearest-even.
20377 //
20378 // We ignore the rounding mode problem, even in strictfp. The C++ standard
20379 // suggests it is OK if the floating-point mode may not match the calling
20380 // thread.
20381 if (Ty->isFloatTy()) {
20382 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
20383 : AtomicExpansionKind::CmpXChg;
20384 }
20385
20386 if (Ty->isDoubleTy()) {
20387 // Ignores denormal mode, but we don't consider flushing mandatory.
20388 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
20389 : AtomicExpansionKind::CmpXChg;
20390 }
20391
20392 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
20393 return AtomicExpansionKind::None;
20394
20395 return AtomicExpansionKind::CmpXChg;
20396 }
20397
20398 // LDS atomics respect the denormal mode from the mode register.
20399 //
20400 // Traditionally f32 global/buffer memory atomics would unconditionally
20401 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
20402 // flush.
20403 //
20404 // On targets with flat atomic fadd, denormals would flush depending on
20405 // whether the target address resides in LDS or global memory. We consider
20406 // this flat-maybe-flush as will-flush.
20407 if (Ty->isFloatTy() &&
20408 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
20409 !atomicIgnoresDenormalModeOrFPModeIsFTZ(RMW))
20410 return AtomicExpansionKind::CmpXChg;
20411
20412 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
20413 // safe. The message phrasing also should be better.
20414 if (globalMemoryFPAtomicIsLegal(Subtarget: *Subtarget, RMW, HasSystemScope)) {
20415 if (AS == AMDGPUAS::FLAT_ADDRESS) {
20416 // gfx942, gfx12
20417 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
20418 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20419 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
20420 // gfx90a, gfx942, gfx12
20421 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
20422 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20423
20424 // gfx942, gfx12
20425 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
20426 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20427 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
20428 // gfx90a, gfx942, gfx12
20429 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
20430 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20431
20432 // While gfx90a/gfx942 supports v2bf16 for global/flat, it does not for
20433 // buffer. gfx12 does have the buffer version.
20434 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
20435 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20436 }
20437
20438 // global and flat atomic fadd f64: gfx90a, gfx942.
20439 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
20440 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20441
20442 if (AS != AMDGPUAS::FLAT_ADDRESS) {
20443 if (Ty->isFloatTy()) {
20444 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx942,
20445 // gfx11+.
20446 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
20447 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20448 // global/buffer atomic fadd f32 rtn: gfx90a, gfx942, gfx11+.
20449 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
20450 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20451 } else {
20452 // gfx908
20453 if (RMW->use_empty() &&
20454 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
20455 isV2F16(Ty))
20456 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20457 }
20458 }
20459
20460 // flat atomic fadd f32: gfx942, gfx11+.
20461 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
20462 if (Subtarget->hasFlatAtomicFaddF32Inst())
20463 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20464
20465 // If it is in flat address space, and the type is float, we will try to
20466 // expand it, if the target supports global and lds atomic fadd. The
20467 // reason we need that is, in the expansion, we emit the check of
20468 // address space. If it is in global address space, we emit the global
20469 // atomic fadd; if it is in shared address space, we emit the LDS atomic
20470 // fadd.
20471 if (Subtarget->hasLDSFPAtomicAddF32()) {
20472 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
20473 return AtomicExpansionKind::CustomExpand;
20474 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
20475 return AtomicExpansionKind::CustomExpand;
20476 }
20477 }
20478 }
20479
20480 return AtomicExpansionKind::CmpXChg;
20481 }
20482 case AtomicRMWInst::FMin:
20483 case AtomicRMWInst::FMax: {
20484 Type *Ty = RMW->getType();
20485
20486 // LDS float and double fmin/fmax were always supported.
20487 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
20488 return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
20489 : AtomicExpansionKind::CmpXChg;
20490 }
20491
20492 if (globalMemoryFPAtomicIsLegal(Subtarget: *Subtarget, RMW, HasSystemScope)) {
20493 // For flat and global cases:
20494 // float, double in gfx7. Manual claims denormal support.
20495 // Removed in gfx8.
20496 // float, double restored in gfx10.
20497 // double removed again in gfx11, so only f32 for gfx11/gfx12.
20498 //
20499 // For gfx9, gfx90a and gfx942 support f64 for global (same as fadd), but
20500 // no f32.
20501 if (AS == AMDGPUAS::FLAT_ADDRESS) {
20502 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
20503 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20504 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
20505 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20506 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
20507 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
20508 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
20509 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20510 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
20511 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20512 }
20513 }
20514
20515 return AtomicExpansionKind::CmpXChg;
20516 }
20517 case AtomicRMWInst::Nand:
20518 case AtomicRMWInst::FSub:
20519 default:
20520 return AtomicExpansionKind::CmpXChg;
20521 }
20522
20523 llvm_unreachable("covered atomicrmw op switch");
20524}
20525
20526TargetLowering::AtomicExpansionKind
20527SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
20528 return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
20529 ? getPrivateAtomicExpansionKind(STI: *getSubtarget())
20530 : AtomicExpansionKind::None;
20531}
20532
20533TargetLowering::AtomicExpansionKind
20534SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
20535 return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
20536 ? getPrivateAtomicExpansionKind(STI: *getSubtarget())
20537 : AtomicExpansionKind::None;
20538}
20539
20540TargetLowering::AtomicExpansionKind
20541SITargetLowering::shouldExpandAtomicCmpXchgInIR(
20542 const AtomicCmpXchgInst *CmpX) const {
20543 unsigned AddrSpace = CmpX->getPointerAddressSpace();
20544 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
20545 return getPrivateAtomicExpansionKind(STI: *getSubtarget());
20546
20547 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(I: CmpX))
20548 return AtomicExpansionKind::None;
20549
20550 const DataLayout &DL = CmpX->getDataLayout();
20551
20552 Type *ValTy = CmpX->getNewValOperand()->getType();
20553
20554 // If a 64-bit flat atomic may alias private, we need to avoid using the
20555 // atomic in the private case.
20556 return DL.getTypeSizeInBits(Ty: ValTy) == 64 ? AtomicExpansionKind::CustomExpand
20557 : AtomicExpansionKind::None;
20558}
20559
20560const TargetRegisterClass *
20561SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
20562 const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, isDivergent: false);
20563 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
20564 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
20565 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
20566 : &AMDGPU::SReg_32RegClass;
20567 if (!TRI->isSGPRClass(RC) && !isDivergent)
20568 return TRI->getEquivalentSGPRClass(VRC: RC);
20569 if (TRI->isSGPRClass(RC) && isDivergent) {
20570 if (Subtarget->hasGFX90AInsts())
20571 return TRI->getEquivalentAVClass(SRC: RC);
20572 return TRI->getEquivalentVGPRClass(SRC: RC);
20573 }
20574
20575 return RC;
20576}
20577
20578// FIXME: This is a workaround for DivergenceAnalysis not understanding always
20579// uniform values (as produced by the mask results of control flow intrinsics)
20580// used outside of divergent blocks. The phi users need to also be treated as
20581// always uniform.
20582//
20583// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
20584static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
20585 unsigned WaveSize) {
20586 // FIXME: We assume we never cast the mask results of a control flow
20587 // intrinsic.
20588 // Early exit if the type won't be consistent as a compile time hack.
20589 IntegerType *IT = dyn_cast<IntegerType>(Val: V->getType());
20590 if (!IT || IT->getBitWidth() != WaveSize)
20591 return false;
20592
20593 if (!isa<Instruction>(Val: V))
20594 return false;
20595 if (!Visited.insert(Ptr: V).second)
20596 return false;
20597 bool Result = false;
20598 for (const auto *U : V->users()) {
20599 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: U)) {
20600 if (V == U->getOperand(i: 1)) {
20601 switch (Intrinsic->getIntrinsicID()) {
20602 default:
20603 Result = false;
20604 break;
20605 case Intrinsic::amdgcn_if_break:
20606 case Intrinsic::amdgcn_if:
20607 case Intrinsic::amdgcn_else:
20608 Result = true;
20609 break;
20610 }
20611 }
20612 if (V == U->getOperand(i: 0)) {
20613 switch (Intrinsic->getIntrinsicID()) {
20614 default:
20615 Result = false;
20616 break;
20617 case Intrinsic::amdgcn_end_cf:
20618 case Intrinsic::amdgcn_loop:
20619 Result = true;
20620 break;
20621 }
20622 }
20623 } else {
20624 Result = hasCFUser(V: U, Visited, WaveSize);
20625 }
20626 if (Result)
20627 break;
20628 }
20629 return Result;
20630}
20631
20632bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
20633 const Value *V) const {
20634 if (const CallInst *CI = dyn_cast<CallInst>(Val: V)) {
20635 if (CI->isInlineAsm()) {
20636 // FIXME: This cannot give a correct answer. This should only trigger in
20637 // the case where inline asm returns mixed SGPR and VGPR results, used
20638 // outside the defining block. We don't have a specific result to
20639 // consider, so this assumes if any value is SGPR, the overall register
20640 // also needs to be SGPR.
20641 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
20642 TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
20643 DL: MF.getDataLayout(), TRI: Subtarget->getRegisterInfo(), Call: *CI);
20644 for (auto &TC : TargetConstraints) {
20645 if (TC.Type == InlineAsm::isOutput) {
20646 ComputeConstraintToUse(OpInfo&: TC, Op: SDValue());
20647 const TargetRegisterClass *RC =
20648 getRegForInlineAsmConstraint(TRI_: SIRI, Constraint: TC.ConstraintCode,
20649 VT: TC.ConstraintVT)
20650 .second;
20651 if (RC && SIRI->isSGPRClass(RC))
20652 return true;
20653 }
20654 }
20655 }
20656 }
20657 SmallPtrSet<const Value *, 16> Visited;
20658 return hasCFUser(V, Visited, WaveSize: Subtarget->getWavefrontSize());
20659}
20660
20661bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
20662 for (SDUse &Use : N->uses()) {
20663 if (MemSDNode *M = dyn_cast<MemSDNode>(Val: Use.getUser())) {
20664 if (getBasePtrIndex(N: M) == Use.getOperandNo())
20665 return true;
20666 }
20667 }
20668 return false;
20669}
20670
20671bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
20672 SDValue N1) const {
20673 if (!N0.hasOneUse())
20674 return false;
20675 // Take care of the opportunity to keep N0 uniform
20676 if (N0->isDivergent() || !N1->isDivergent())
20677 return true;
20678 // Check if we have a good chance to form the memory access pattern with the
20679 // base and offset
20680 return (DAG.isBaseWithConstantOffset(Op: N0) &&
20681 hasMemSDNodeUser(N: *N0->user_begin()));
20682}
20683
20684bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
20685 Register N0, Register N1) const {
20686 return MRI.hasOneNonDBGUse(RegNo: N0); // FIXME: handle regbanks
20687}
20688
20689MachineMemOperand::Flags
20690SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
20691 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
20692 MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
20693 if (I.getMetadata(Kind: "amdgpu.noclobber"))
20694 Flags |= MONoClobber;
20695 if (I.getMetadata(Kind: "amdgpu.last.use"))
20696 Flags |= MOLastUse;
20697 return Flags;
20698}
20699
20700void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
20701 Instruction *AI) const {
20702 // Given: atomicrmw fadd ptr %addr, float %val ordering
20703 //
20704 // With this expansion we produce the following code:
20705 // [...]
20706 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
20707 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
20708 //
20709 // atomicrmw.shared:
20710 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
20711 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
20712 // float %val ordering
20713 // br label %atomicrmw.phi
20714 //
20715 // atomicrmw.check.private:
20716 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
20717 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
20718 //
20719 // atomicrmw.private:
20720 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
20721 // %loaded.private = load float, ptr addrspace(5) %cast.private
20722 // %val.new = fadd float %loaded.private, %val
20723 // store float %val.new, ptr addrspace(5) %cast.private
20724 // br label %atomicrmw.phi
20725 //
20726 // atomicrmw.global:
20727 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
20728 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
20729 // float %val ordering
20730 // br label %atomicrmw.phi
20731 //
20732 // atomicrmw.phi:
20733 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
20734 // [ %loaded.private, %atomicrmw.private ],
20735 // [ %loaded.global, %atomicrmw.global ]
20736 // br label %atomicrmw.end
20737 //
20738 // atomicrmw.end:
20739 // [...]
20740 //
20741 //
20742 // For 64-bit atomics which may reside in private memory, we perform a simpler
20743 // version that only inserts the private check, and uses the flat operation.
20744
20745 IRBuilder<> Builder(AI);
20746 LLVMContext &Ctx = Builder.getContext();
20747
20748 auto *RMW = dyn_cast<AtomicRMWInst>(Val: AI);
20749 const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
20750 : AtomicCmpXchgInst::getPointerOperandIndex();
20751 Value *Addr = AI->getOperand(i: PtrOpIdx);
20752
20753 /// TODO: Only need to check private, then emit flat-known-not private (no
20754 /// need for shared block, or cast to global).
20755 AtomicCmpXchgInst *CX = dyn_cast<AtomicCmpXchgInst>(Val: AI);
20756
20757 Align Alignment;
20758 if (RMW)
20759 Alignment = RMW->getAlign();
20760 else if (CX)
20761 Alignment = CX->getAlign();
20762 else
20763 llvm_unreachable("unhandled atomic operation");
20764
20765 // FullFlatEmulation is true if we need to issue the private, shared, and
20766 // global cases.
20767 //
20768 // If this is false, we are only dealing with the flat-targeting-private case,
20769 // where we only insert a check for private and still use the flat instruction
20770 // for global and shared.
20771
20772 bool FullFlatEmulation =
20773 RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
20774 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
20775 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
20776 RMW->getType()->isDoubleTy()));
20777
20778 // If the return value isn't used, do not introduce a false use in the phi.
20779 bool ReturnValueIsUsed = !AI->use_empty();
20780
20781 BasicBlock *BB = Builder.GetInsertBlock();
20782 Function *F = BB->getParent();
20783 BasicBlock *ExitBB =
20784 BB->splitBasicBlock(I: Builder.GetInsertPoint(), BBName: "atomicrmw.end");
20785 BasicBlock *SharedBB = nullptr;
20786
20787 BasicBlock *CheckPrivateBB = BB;
20788 if (FullFlatEmulation) {
20789 SharedBB = BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.shared", Parent: F, InsertBefore: ExitBB);
20790 CheckPrivateBB =
20791 BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.check.private", Parent: F, InsertBefore: ExitBB);
20792 }
20793
20794 BasicBlock *PrivateBB =
20795 BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.private", Parent: F, InsertBefore: ExitBB);
20796 BasicBlock *GlobalBB = BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.global", Parent: F, InsertBefore: ExitBB);
20797 BasicBlock *PhiBB = BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.phi", Parent: F, InsertBefore: ExitBB);
20798
20799 std::prev(x: BB->end())->eraseFromParent();
20800 Builder.SetInsertPoint(BB);
20801
20802 Value *LoadedShared = nullptr;
20803 if (FullFlatEmulation) {
20804 Value *IsShared = Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_is_shared,
20805 Args: {Addr}, FMFSource: nullptr, Name: "is.shared");
20806 Builder.CreateCondBr(Cond: IsShared, True: SharedBB, False: CheckPrivateBB);
20807 Builder.SetInsertPoint(SharedBB);
20808 Value *CastToLocal = Builder.CreateAddrSpaceCast(
20809 V: Addr, DestTy: PointerType::get(C&: Ctx, AddressSpace: AMDGPUAS::LOCAL_ADDRESS));
20810
20811 Instruction *Clone = AI->clone();
20812 Clone->insertInto(ParentBB: SharedBB, It: SharedBB->end());
20813 Clone->getOperandUse(i: PtrOpIdx).set(CastToLocal);
20814 LoadedShared = Clone;
20815
20816 Builder.CreateBr(Dest: PhiBB);
20817 Builder.SetInsertPoint(CheckPrivateBB);
20818 }
20819
20820 Value *IsPrivate = Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_is_private,
20821 Args: {Addr}, FMFSource: nullptr, Name: "is.private");
20822 Builder.CreateCondBr(Cond: IsPrivate, True: PrivateBB, False: GlobalBB);
20823
20824 Builder.SetInsertPoint(PrivateBB);
20825
20826 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
20827 V: Addr, DestTy: PointerType::get(C&: Ctx, AddressSpace: AMDGPUAS::PRIVATE_ADDRESS));
20828
20829 Value *LoadedPrivate;
20830 if (RMW) {
20831 LoadedPrivate = Builder.CreateAlignedLoad(
20832 Ty: RMW->getType(), Ptr: CastToPrivate, Align: RMW->getAlign(), Name: "loaded.private");
20833
20834 Value *NewVal = buildAtomicRMWValue(Op: RMW->getOperation(), Builder,
20835 Loaded: LoadedPrivate, Val: RMW->getValOperand());
20836
20837 Builder.CreateAlignedStore(Val: NewVal, Ptr: CastToPrivate, Align: RMW->getAlign());
20838 } else {
20839 auto [ResultLoad, Equal] =
20840 buildCmpXchgValue(Builder, Ptr: CastToPrivate, Cmp: CX->getCompareOperand(),
20841 Val: CX->getNewValOperand(), Alignment: CX->getAlign());
20842
20843 Value *Insert = Builder.CreateInsertValue(Agg: PoisonValue::get(T: CX->getType()),
20844 Val: ResultLoad, Idxs: 0);
20845 LoadedPrivate = Builder.CreateInsertValue(Agg: Insert, Val: Equal, Idxs: 1);
20846 }
20847
20848 Builder.CreateBr(Dest: PhiBB);
20849
20850 Builder.SetInsertPoint(GlobalBB);
20851
20852 // Continue using a flat instruction if we only emitted the check for private.
20853 Instruction *LoadedGlobal = AI;
20854 if (FullFlatEmulation) {
20855 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
20856 V: Addr, DestTy: PointerType::get(C&: Ctx, AddressSpace: AMDGPUAS::GLOBAL_ADDRESS));
20857 AI->getOperandUse(i: PtrOpIdx).set(CastToGlobal);
20858 }
20859
20860 AI->removeFromParent();
20861 AI->insertInto(ParentBB: GlobalBB, It: GlobalBB->end());
20862
20863 // The new atomicrmw may go through another round of legalization later.
20864 if (!FullFlatEmulation) {
20865 // We inserted the runtime check already, make sure we do not try to
20866 // re-expand this.
20867 // TODO: Should union with any existing metadata.
20868 MDBuilder MDB(F->getContext());
20869 MDNode *RangeNotPrivate =
20870 MDB.createRange(Lo: APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
20871 Hi: APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
20872 LoadedGlobal->setMetadata(KindID: LLVMContext::MD_noalias_addrspace,
20873 Node: RangeNotPrivate);
20874 }
20875
20876 Builder.CreateBr(Dest: PhiBB);
20877
20878 Builder.SetInsertPoint(PhiBB);
20879
20880 if (ReturnValueIsUsed) {
20881 PHINode *Loaded = Builder.CreatePHI(Ty: AI->getType(), NumReservedValues: 3);
20882 AI->replaceAllUsesWith(V: Loaded);
20883 if (FullFlatEmulation)
20884 Loaded->addIncoming(V: LoadedShared, BB: SharedBB);
20885 Loaded->addIncoming(V: LoadedPrivate, BB: PrivateBB);
20886 Loaded->addIncoming(V: LoadedGlobal, BB: GlobalBB);
20887 Loaded->takeName(V: AI);
20888 }
20889
20890 Builder.CreateBr(Dest: ExitBB);
20891}
20892
20893static void convertScratchAtomicToFlatAtomic(Instruction *I,
20894 unsigned PtrOpIdx) {
20895 Value *PtrOp = I->getOperand(i: PtrOpIdx);
20896 assert(PtrOp->getType()->getPointerAddressSpace() ==
20897 AMDGPUAS::PRIVATE_ADDRESS);
20898
20899 Type *FlatPtr = PointerType::get(C&: I->getContext(), AddressSpace: AMDGPUAS::FLAT_ADDRESS);
20900 Value *ASCast = CastInst::CreatePointerCast(S: PtrOp, Ty: FlatPtr, Name: "scratch.ascast",
20901 InsertBefore: I->getIterator());
20902 I->setOperand(i: PtrOpIdx, Val: ASCast);
20903}
20904
20905void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
20906 AtomicRMWInst::BinOp Op = AI->getOperation();
20907
20908 if (AI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
20909 return convertScratchAtomicToFlatAtomic(I: AI, PtrOpIdx: AI->getPointerOperandIndex());
20910
20911 if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
20912 Op == AtomicRMWInst::Xor) {
20913 if (const auto *ConstVal = dyn_cast<Constant>(Val: AI->getValOperand());
20914 ConstVal && ConstVal->isNullValue()) {
20915 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
20916 AI->setOperation(AtomicRMWInst::Add);
20917
20918 // We may still need the private-alias-flat handling below.
20919
20920 // TODO: Skip this for cases where we cannot access remote memory.
20921 }
20922 }
20923
20924 // The non-flat expansions should only perform the de-canonicalization of
20925 // identity values.
20926 if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
20927 return;
20928
20929 emitExpandAtomicAddrSpacePredicate(AI);
20930}
20931
20932void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
20933 if (CI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
20934 return convertScratchAtomicToFlatAtomic(I: CI, PtrOpIdx: CI->getPointerOperandIndex());
20935
20936 emitExpandAtomicAddrSpacePredicate(AI: CI);
20937}
20938
20939void SITargetLowering::emitExpandAtomicLoad(LoadInst *LI) const {
20940 if (LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
20941 return convertScratchAtomicToFlatAtomic(I: LI, PtrOpIdx: LI->getPointerOperandIndex());
20942
20943 llvm_unreachable(
20944 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
20945}
20946
20947void SITargetLowering::emitExpandAtomicStore(StoreInst *SI) const {
20948 if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
20949 return convertScratchAtomicToFlatAtomic(I: SI, PtrOpIdx: SI->getPointerOperandIndex());
20950
20951 llvm_unreachable(
20952 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
20953}
20954
20955LoadInst *
20956SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
20957 IRBuilder<> Builder(AI);
20958 auto Order = AI->getOrdering();
20959
20960 // The optimization removes store aspect of the atomicrmw. Therefore, cache
20961 // must be flushed if the atomic ordering had a release semantics. This is
20962 // not necessary a fence, a release fence just coincides to do that flush.
20963 // Avoid replacing of an atomicrmw with a release semantics.
20964 if (isReleaseOrStronger(AO: Order))
20965 return nullptr;
20966
20967 LoadInst *LI = Builder.CreateAlignedLoad(
20968 Ty: AI->getType(), Ptr: AI->getPointerOperand(), Align: AI->getAlign());
20969 LI->setAtomic(Ordering: Order, SSID: AI->getSyncScopeID());
20970 LI->copyMetadata(SrcInst: *AI);
20971 LI->takeName(V: AI);
20972 AI->replaceAllUsesWith(V: LI);
20973 AI->eraseFromParent();
20974 return LI;
20975}
20976