//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Custom DAG lowering for SI
//
//===----------------------------------------------------------------------===//

#include "SIISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPULaneMaskUtils.h"
#include "AMDGPUSelectionDAGInfo.h"
#include "AMDGPUTargetMachine.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/ByteProvider.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachinePassManager.h"
#include "llvm/CodeGen/PseudoSourceValueManager.h"
#include "llvm/CodeGen/SDPatternMatch.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/ModRef.h"
#include "llvm/Transforms/Utils/LowerAtomic.h"
#include <optional>

using namespace llvm;
using namespace llvm::SDPatternMatch;

#define DEBUG_TYPE "si-lower"

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool>
    DisableLoopAlignment("amdgpu-disable-loop-alignment",
                         cl::desc("Do not align and prefetch loops"),
                         cl::init(false));

static cl::opt<bool> UseDivergentRegisterIndexing(
    "amdgpu-use-divergent-register-indexing", cl::Hidden,
    cl::desc("Use indirect register addressing for divergent indexes"),
    cl::init(false));
static bool denormalModeIsFlushAllF32(const MachineFunction &MF) {
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
}

static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF) {
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
}

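// Return the first SGPR_32 register that \p CCInfo has not yet allocated.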
static unsigned findFirstFreeSGPR(CCState &CCInfo) {
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
    if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
      return AMDGPU::SGPR0 + Reg;
    }
  }
  llvm_unreachable("Cannot allocate sgpr");
}

SITargetLowering::SITargetLowering(const TargetMachine &TM,
                                   const GCNSubtarget &STI)
    : AMDGPUTargetLowering(TM, STI, STI), Subtarget(&STI) {
  addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);

  addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);

  const SIRegisterInfo *TRI = STI.getRegisterInfo();
  const TargetRegisterClass *V32RegClass =
      TRI->getDefaultVectorSuperClassForBitWidth(32);
  addRegisterClass(MVT::f32, V32RegClass);

  addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);

  const TargetRegisterClass *V64RegClass =
      TRI->getDefaultVectorSuperClassForBitWidth(64);

  addRegisterClass(MVT::f64, V64RegClass);
  addRegisterClass(MVT::v2f32, V64RegClass);
  addRegisterClass(MVT::Untyped, V64RegClass);

  addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
  addRegisterClass(MVT::v3f32, TRI->getDefaultVectorSuperClassForBitWidth(96));

  addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
  addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);

  addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
  addRegisterClass(MVT::v4f32, TRI->getDefaultVectorSuperClassForBitWidth(128));

  addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
  addRegisterClass(MVT::v5f32, TRI->getDefaultVectorSuperClassForBitWidth(160));

  addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
  addRegisterClass(MVT::v6f32, TRI->getDefaultVectorSuperClassForBitWidth(192));

  addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
  addRegisterClass(MVT::v3f64, TRI->getDefaultVectorSuperClassForBitWidth(192));

  addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
  addRegisterClass(MVT::v7f32, TRI->getDefaultVectorSuperClassForBitWidth(224));

  addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
  addRegisterClass(MVT::v8f32, TRI->getDefaultVectorSuperClassForBitWidth(256));

  addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
  addRegisterClass(MVT::v4f64, TRI->getDefaultVectorSuperClassForBitWidth(256));

  addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
  addRegisterClass(MVT::v9f32, TRI->getDefaultVectorSuperClassForBitWidth(288));

  addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
  addRegisterClass(MVT::v10f32,
                   TRI->getDefaultVectorSuperClassForBitWidth(320));

  addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
  addRegisterClass(MVT::v11f32,
                   TRI->getDefaultVectorSuperClassForBitWidth(352));

  addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
  addRegisterClass(MVT::v12f32,
                   TRI->getDefaultVectorSuperClassForBitWidth(384));

  addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
  addRegisterClass(MVT::v16f32,
                   TRI->getDefaultVectorSuperClassForBitWidth(512));

  addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
  addRegisterClass(MVT::v8f64, TRI->getDefaultVectorSuperClassForBitWidth(512));

  addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
  addRegisterClass(MVT::v16f64,
                   TRI->getDefaultVectorSuperClassForBitWidth(1024));

  if (Subtarget->has16BitInsts()) {
    if (Subtarget->useRealTrue16Insts()) {
      addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
      addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
      addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
    } else {
      addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
      addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
      addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
    }

    // Unless there are also VOP3P operations, no operations are really legal.
    addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
    addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
    addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
    addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
    addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
    addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
    addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
    addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
    addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
    addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
    addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
    addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
    addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
    addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
    addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
  }

  addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
  addRegisterClass(MVT::v32f32,
                   TRI->getDefaultVectorSuperClassForBitWidth(1024));

  computeRegisterProperties(Subtarget->getRegisterInfo());

  // The boolean content concept here is too inflexible. Compares only ever
  // really produce a 1-bit result. Any copy/extend from these will turn into a
  // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
  // it's what most targets use.
  setBooleanContents(ZeroOrOneBooleanContent);
  setBooleanVectorContents(ZeroOrOneBooleanContent);

  // We need to custom lower vector stores from local memory
  setOperationAction(ISD::LOAD,
                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                      MVT::i1, MVT::v32i32},
                     Custom);

  setOperationAction(ISD::STORE,
                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                      MVT::i1, MVT::v32i32},
                     Custom);

  if (isTypeLegal(MVT::bf16)) {
    for (unsigned Opc :
         {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV,
          ISD::FREM, ISD::FMA, ISD::FMINNUM, ISD::FMAXNUM,
          ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT,
          ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI,
          ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2,
          ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
          ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT,
          ISD::FROUND, ISD::FROUNDEVEN, ISD::FFLOOR, ISD::FCANONICALIZE,
          ISD::SETCC}) {
      setOperationAction(Opc, MVT::bf16, Promote);
    }

    setOperationAction(ISD::FP_ROUND, MVT::bf16, Expand);

    setOperationAction(ISD::SELECT, MVT::bf16, Promote);
    AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);

    setOperationAction(ISD::FABS, MVT::bf16, Legal);
    setOperationAction(ISD::FNEG, MVT::bf16, Legal);
    setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Legal);

    // We only need to custom lower because we can't specify an action for bf16
    // sources.
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  }

  setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
  setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
  setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
  setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
  setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
  setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
  setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
  setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
  setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
  setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
  setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
  setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
  setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
  setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
  setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);

  setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
  setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
  setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
  setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
  setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
  setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);

  setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
  setOperationAction(ISD::ExternalSymbol, {MVT::i32, MVT::i64}, Custom);

  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  setOperationAction(ISD::SELECT, MVT::i64, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Promote);
  AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);

  setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);

  setOperationAction(ISD::SELECT_CC,
                     {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);

  setOperationAction(ISD::SETCC, MVT::i1, Promote);
  setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
  AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);

  setOperationAction(ISD::TRUNCATE,
                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
                     Expand);
  setOperationAction(ISD::FP_ROUND,
                     {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
                      MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
                      MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
                     Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG,
                     {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
                      MVT::v3i16, MVT::v4i16, MVT::Other},
                     Custom);

  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC,
                     {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);

  setOperationAction({ISD::ABS, ISD::UADDO, ISD::USUBO}, MVT::i32, Legal);

  setOperationAction({ISD::UADDO_CARRY, ISD::USUBO_CARRY}, MVT::i32, Legal);

  setOperationAction({ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS}, MVT::i64,
                     Expand);

#if 0
  setOperationAction({ISD::UADDO_CARRY, ISD::USUBO_CARRY}, MVT::i64, Legal);
#endif

  // We only support LOAD/STORE and vector manipulation ops for vectors
  // with > 4 elements.
  for (MVT VT :
       {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
        MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
        MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
        MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
        MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
        MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
        MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
        MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
    for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
      switch (Op) {
      case ISD::LOAD:
      case ISD::STORE:
      case ISD::BUILD_VECTOR:
      case ISD::BITCAST:
      case ISD::UNDEF:
      case ISD::EXTRACT_VECTOR_ELT:
      case ISD::INSERT_VECTOR_ELT:
      case ISD::SCALAR_TO_VECTOR:
      case ISD::IS_FPCLASS:
        break;
      case ISD::EXTRACT_SUBVECTOR:
      case ISD::INSERT_SUBVECTOR:
      case ISD::CONCAT_VECTORS:
        setOperationAction(Op, VT, Custom);
        break;
      default:
        setOperationAction(Op, VT, Expand);
        break;
      }
    }
  }

  setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);

  // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
  // is expanded to avoid having two separate loops in case the index is a VGPR.

  // Most operations are naturally 32-bit vector operations. We only support
  // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
  for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);

    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);

    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
  }

  for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);

    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);

    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
  }

  for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);

    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);

    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
  }

  for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);

    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);

    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
  }

  for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);

    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);

    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
  }

  setOperationAction(ISD::VECTOR_SHUFFLE,
                     {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
                      MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
                     Custom);

  if (Subtarget->hasPkMovB32()) {
    // TODO: 16-bit element vectors should be legal with even aligned elements.
    // TODO: Can be legal with wider source types than the result with
    // subregister extracts.
    setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
  }

  setOperationAction({ISD::AND, ISD::OR, ISD::XOR}, MVT::v2i32, Legal);
  // Prevent SELECT v2i32 from being implemented with the above bitwise ops and
  // instead lower to cndmask in SITargetLowering::LowerSELECT().
  setOperationAction(ISD::SELECT, MVT::v2i32, Custom);
  // Enable MatchRotate to produce ISD::ROTR, which is later transformed to
  // alignbit.
  setOperationAction(ISD::ROTR, MVT::v2i32, Custom);

  setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
                     Custom);

  // Avoid stack access for these.
  // TODO: Generalize to more vector types.
  setOperationAction({ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT},
                     {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
                      MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
                     Custom);

  // Deal with vec3 vector operations when widened to vec4.
  setOperationAction(ISD::INSERT_SUBVECTOR,
                     {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);

  // Deal with vec5/6/7 vector operations when widened to vec8.
  setOperationAction(ISD::INSERT_SUBVECTOR,
                     {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
                      MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
                      MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
                      MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
                     Custom);

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
  // and output demarshalling
  setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);

  // We can't return success/failure, only the old value,
  // let LLVM add the comparison
  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, {MVT::i32, MVT::i64},
                     Expand);

  setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);

  setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);

  // FIXME: This should be narrowed to i32, but that only happens if i64 is
  // illegal.
  // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
  setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);

  // This is s_memtime on SI and s_memrealtime on VI.
  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);

  if (Subtarget->hasSMemRealTime() ||
      Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
    setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal);
  setOperationAction({ISD::TRAP, ISD::DEBUGTRAP}, MVT::Other, Custom);

  if (Subtarget->has16BitInsts()) {
    setOperationAction({ISD::FPOW, ISD::FPOWI}, MVT::f16, Promote);
    setOperationAction({ISD::FLOG, ISD::FEXP, ISD::FLOG10}, MVT::f16, Custom);
    setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
    setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Legal);
    setOperationAction(ISD::FCANONICALIZE, MVT::f16, Legal);
  } else {
    setOperationAction(ISD::FSQRT, MVT::f16, Custom);
  }

  if (Subtarget->hasMadMacF32Insts())
    setOperationAction(ISD::FMAD, MVT::f32, Legal);

  setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, MVT::i32, Custom);
  setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom);

  // We only really have 32-bit BFE instructions (and 16-bit on VI).
  //
  // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
  // effort to match them now. We want this to be false for i64 cases when the
  // extraction isn't restricted to the upper or lower half. Ideally we would
  // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
  // span the midpoint are probably relatively rare, so don't worry about them
  // for now.
  setHasExtractBitsInsn(true);

  // Clamp modifier on add/sub
  if (Subtarget->hasIntClamp())
    setOperationAction({ISD::UADDSAT, ISD::USUBSAT}, MVT::i32, Legal);

  if (Subtarget->hasAddNoCarryInsts())
    setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
                       Legal);

  setOperationAction(
      {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
      {MVT::f32, MVT::f64}, Custom);

  // These are really only legal for ieee_mode functions. We should be avoiding
  // them for functions that don't have ieee_mode enabled, so just say they are
  // legal.
  setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
                     {MVT::f32, MVT::f64}, Legal);

  if (Subtarget->haveRoundOpsF64())
    setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FROUNDEVEN}, MVT::f64,
                       Legal);
  else
    setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FFLOOR},
                       MVT::f64, Custom);

  setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
  setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
                     Legal);
  setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);

  setOperationAction({ISD::FSIN, ISD::FCOS, ISD::FDIV}, MVT::f32, Custom);
  setOperationAction(ISD::FDIV, MVT::f64, Custom);

  setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
  setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);

  setOperationAction({ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}, MVT::i32,
                     Custom);
  setOperationAction({ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}, MVT::i16,
                     Custom);
  setOperationAction({ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}, MVT::i1,
                     Custom);

  // Custom lower these because we can't specify a rule based on an illegal
  // source bf16.
  setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f32, Custom);
  setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f64, Custom);

  if (Subtarget->has16BitInsts()) {
    setOperationAction({ISD::Constant, ISD::SMIN, ISD::SMAX, ISD::UMIN,
                        ISD::UMAX, ISD::UADDSAT, ISD::USUBSAT},
                       MVT::i16, Legal);

    AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);

    setOperationAction({ISD::ROTR, ISD::ROTL, ISD::SELECT_CC, ISD::BR_CC},
                       MVT::i16, Expand);

    setOperationAction({ISD::SIGN_EXTEND, ISD::SDIV, ISD::UDIV, ISD::SREM,
                        ISD::UREM, ISD::BITREVERSE, ISD::CTTZ,
                        ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF,
                        ISD::CTPOP},
                       MVT::i16, Promote);

    setOperationAction(ISD::LOAD, MVT::i16, Custom);

    setTruncStoreAction(MVT::i64, MVT::i16, Expand);

    setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
    AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
    setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
    AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);

    setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::i16, Custom);
    setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom);
    setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i1, Custom);

    setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i32, Custom);

    // F16 - Constant Actions.
    setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
    setOperationAction(ISD::ConstantFP, MVT::bf16, Legal);

    // F16 - Load/Store Actions.
    setOperationAction(ISD::LOAD, MVT::f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
    setOperationAction(ISD::STORE, MVT::f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);

    // BF16 - Load/Store Actions.
    setOperationAction(ISD::LOAD, MVT::bf16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
    setOperationAction(ISD::STORE, MVT::bf16, Promote);
    AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);

    // F16 - VOP1 Actions.
    setOperationAction({ISD::FP_ROUND, ISD::STRICT_FP_ROUND, ISD::FCOS,
                        ISD::FSIN, ISD::FROUND},
                       MVT::f16, Custom);

    // BF16 - VOP1 Actions.
    if (Subtarget->hasBF16TransInsts())
      setOperationAction({ISD::FCOS, ISD::FSIN, ISD::FDIV}, MVT::bf16, Custom);

    setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
                        ISD::FP_TO_UINT_SAT},
                       MVT::f16, Promote);
    setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
                        ISD::FP_TO_UINT_SAT},
                       MVT::bf16, Promote);

    // F16 - VOP2 Actions.
    setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
                       Expand);
    setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, MVT::f16, Custom);
    setOperationAction(ISD::FFREXP, MVT::f16, Custom);
    setOperationAction(ISD::FDIV, MVT::f16, Custom);

    // F16 - VOP3 Actions.
    setOperationAction(ISD::FMA, MVT::f16, Legal);
    if (STI.hasMadF16())
      setOperationAction(ISD::FMAD, MVT::f16, Legal);

    for (MVT VT :
         {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
          MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
          MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
      for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
        switch (Op) {
        case ISD::LOAD:
        case ISD::STORE:
        case ISD::BUILD_VECTOR:
        case ISD::BITCAST:
        case ISD::UNDEF:
        case ISD::EXTRACT_VECTOR_ELT:
        case ISD::INSERT_VECTOR_ELT:
        case ISD::INSERT_SUBVECTOR:
        case ISD::SCALAR_TO_VECTOR:
        case ISD::IS_FPCLASS:
          break;
        case ISD::EXTRACT_SUBVECTOR:
        case ISD::CONCAT_VECTORS:
        case ISD::FSIN:
        case ISD::FCOS:
          setOperationAction(Op, VT, Custom);
          break;
        default:
          setOperationAction(Op, VT, Expand);
          break;
        }
      }
    }

    // v_perm_b32 can handle either of these.
    setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
    setOperationAction(ISD::BSWAP, MVT::v4i16, Custom);

    // XXX - Do these do anything? Vector constants turn into build_vector.
    setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);

    setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
                       Legal);

    setOperationAction(ISD::STORE, MVT::v2i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::STORE, MVT::v2f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);

    setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);

    setOperationAction(ISD::AND, MVT::v2i16, Promote);
    AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::OR, MVT::v2i16, Promote);
    AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::XOR, MVT::v2i16, Promote);
    AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);

    setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
    setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
    setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);

    setOperationAction(ISD::STORE, MVT::v4i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
    setOperationAction(ISD::STORE, MVT::v4f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
    setOperationAction(ISD::STORE, MVT::v4bf16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);

    setOperationAction(ISD::LOAD, MVT::v8i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
    setOperationAction(ISD::LOAD, MVT::v8f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
    setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);

    setOperationAction(ISD::STORE, MVT::v4i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
    setOperationAction(ISD::STORE, MVT::v4f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);

    setOperationAction(ISD::STORE, MVT::v8i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
    setOperationAction(ISD::STORE, MVT::v8f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
    setOperationAction(ISD::STORE, MVT::v8bf16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);

    setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
    setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
    setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);

    setOperationAction(ISD::STORE, MVT::v16i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
    setOperationAction(ISD::STORE, MVT::v16f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
    setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);

    setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
    setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
    setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);

    setOperationAction(ISD::STORE, MVT::v32i16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
    setOperationAction(ISD::STORE, MVT::v32f16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
    setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
    AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);

    setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
                       MVT::v2i32, Expand);
    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);

    setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
                       MVT::v4i32, Expand);

    setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
                       MVT::v8i32, Expand);

    setOperationAction(ISD::BUILD_VECTOR,
                       {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
                       Subtarget->hasVOP3PInsts() ? Legal : Custom);

    setOperationAction(ISD::FNEG, {MVT::v2f16, MVT::v2bf16}, Legal);
    // This isn't really legal, but this avoids the legalizer unrolling it (and
    // allows matching fneg (fabs x) patterns)
    setOperationAction(ISD::FABS, {MVT::v2f16, MVT::v2bf16}, Legal);

    // Can do this in one BFI plus a constant materialize.
    setOperationAction(ISD::FCOPYSIGN,
                       {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
                        MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
                        MVT::v32f16, MVT::v32bf16},
                       Custom);

    setOperationAction(
        {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
        MVT::f16, Custom);
    setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal);

    setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, ISD::FMINIMUMNUM,
                        ISD::FMAXIMUMNUM},
                       {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
                       Custom);

    setOperationAction({ISD::FMINNUM, ISD::FMAXNUM},
                       {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
                       Expand);

    for (MVT Vec16 :
         {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
          MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
      setOperationAction(
          {ISD::BUILD_VECTOR, ISD::EXTRACT_VECTOR_ELT, ISD::SCALAR_TO_VECTOR},
          Vec16, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, Vec16, Expand);
    }
  }

  if (Subtarget->hasVOP3PInsts()) {
    setOperationAction({ISD::ADD, ISD::SUB, ISD::MUL, ISD::SHL, ISD::SRL,
                        ISD::SRA, ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX,
                        ISD::UADDSAT, ISD::USUBSAT, ISD::SADDSAT, ISD::SSUBSAT},
                       MVT::v2i16, Legal);

    setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FMINNUM_IEEE,
                        ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE},
                       MVT::v2f16, Legal);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT,
                       {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);

    setOperationAction(ISD::VECTOR_SHUFFLE,
                       {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
                        MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
                        MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
                       Custom);

    for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
      // Split vector operations.
      setOperationAction({ISD::SHL, ISD::SRA, ISD::SRL, ISD::ADD, ISD::SUB,
                          ISD::MUL, ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN,
                          ISD::UMAX, ISD::UADDSAT, ISD::SADDSAT, ISD::USUBSAT,
                          ISD::SSUBSAT},
                         VT, Custom);

    for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
      // Split vector operations.
      setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE},
                         VT, Custom);

    setOperationAction(
        {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
        {MVT::v2f16, MVT::v4f16}, Custom);

    setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
    setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
                       Custom);

    if (Subtarget->hasBF16PackedInsts()) {
      for (MVT VT : {MVT::v4bf16, MVT::v8bf16, MVT::v16bf16, MVT::v32bf16})
        // Split vector operations.
        setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE},
                           VT, Custom);
    }

    if (Subtarget->hasPackedFP32Ops()) {
      setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FNEG},
                         MVT::v2f32, Legal);
      setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA},
                         {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
                         Custom);
    }
  }

  setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v4f16, Custom);

  if (Subtarget->has16BitInsts()) {
    setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
    AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
    setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
    AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
  } else {
    // Legalization hack.
    setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);

    setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v2f16, Custom);
  }

  setOperationAction(ISD::SELECT,
                     {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
                      MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
                      MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
                      MVT::v32f16, MVT::v32bf16},
                     Custom);

  setOperationAction({ISD::SMULO, ISD::UMULO}, MVT::i64, Custom);

  if (Subtarget->hasVectorMulU64())
    setOperationAction(ISD::MUL, MVT::i64, Legal);
  else if (Subtarget->hasScalarSMulU64())
    setOperationAction(ISD::MUL, MVT::i64, Custom);

  if (Subtarget->hasMad64_32())
    setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom);

  if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
    setOperationAction(ISD::PREFETCH, MVT::Other, Custom);

  if (Subtarget->hasIEEEMinimumMaximumInsts()) {
    setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM},
                       {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
  } else {
    // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
    if (Subtarget->hasMinimum3Maximum3F32())
      setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f32, Legal);

    if (Subtarget->hasMinimum3Maximum3PKF16()) {
      setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::v2f16, Legal);

      // If only the vector form is available, we need to widen to a vector.
      if (!Subtarget->hasMinimum3Maximum3F16())
        setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f16, Custom);
    }
  }

  if (Subtarget->hasVOP3PInsts()) {
    // We want to break these into v2f16 pieces, not scalarize.
    setOperationAction({ISD::FMINIMUM, ISD::FMAXIMUM},
                       {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
                       Custom);
  }

  if (Subtarget->hasIntMinMax64())
    setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i64,
                       Legal);

  setOperationAction(ISD::INTRINSIC_WO_CHAIN,
                     {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
                      MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
                      MVT::i8},
                     Custom);

  setOperationAction(ISD::INTRINSIC_W_CHAIN,
                     {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
                      MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
                      MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
                      MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
                     Custom);

  setOperationAction(ISD::INTRINSIC_VOID,
                     {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
                      MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
                      MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
                      MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
                     Custom);

  setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
  setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
  setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
  setOperationAction(ISD::GET_FPENV, MVT::i64, Custom);
  setOperationAction(ISD::SET_FPENV, MVT::i64, Custom);

  // TODO: Could move this to custom lowering, could benefit from combines on
  // extract of relevant bits.
  setOperationAction(ISD::GET_FPMODE, MVT::i32, Legal);

  setOperationAction(ISD::MUL, MVT::i1, Promote);

  if (Subtarget->hasBF16ConversionInsts()) {
    setOperationAction(ISD::FP_ROUND, {MVT::bf16, MVT::v2bf16}, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Legal);
  }

  if (Subtarget->hasBF16PackedInsts()) {
    setOperationAction(
        {ISD::FADD, ISD::FMUL, ISD::FMINNUM, ISD::FMAXNUM, ISD::FMA},
        MVT::v2bf16, Legal);
  }

  if (Subtarget->hasBF16TransInsts()) {
    setOperationAction({ISD::FEXP2, ISD::FLOG2, ISD::FSQRT}, MVT::bf16, Legal);
  }

  if (Subtarget->hasCvtPkF16F32Inst()) {
    setOperationAction(ISD::FP_ROUND,
                       {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
                       Custom);
  }

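  // Opcodes for which the target's PerformDAGCombine hook should be invoked.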
  setTargetDAGCombine({ISD::ADD,
                       ISD::PTRADD,
                       ISD::UADDO_CARRY,
                       ISD::SUB,
                       ISD::USUBO_CARRY,
                       ISD::MUL,
                       ISD::FADD,
                       ISD::FSUB,
                       ISD::FDIV,
                       ISD::FMUL,
                       ISD::FMINNUM,
                       ISD::FMAXNUM,
                       ISD::FMINNUM_IEEE,
                       ISD::FMAXNUM_IEEE,
                       ISD::FMINIMUM,
                       ISD::FMAXIMUM,
                       ISD::FMINIMUMNUM,
                       ISD::FMAXIMUMNUM,
                       ISD::FMA,
                       ISD::SMIN,
                       ISD::SMAX,
                       ISD::UMIN,
                       ISD::UMAX,
                       ISD::SETCC,
                       ISD::SELECT,
                       ISD::SMIN,
                       ISD::SMAX,
                       ISD::UMIN,
                       ISD::UMAX,
                       ISD::AND,
                       ISD::OR,
                       ISD::XOR,
                       ISD::SHL,
                       ISD::SRL,
                       ISD::SRA,
                       ISD::FSHR,
                       ISD::SINT_TO_FP,
                       ISD::UINT_TO_FP,
                       ISD::FCANONICALIZE,
                       ISD::SCALAR_TO_VECTOR,
                       ISD::ZERO_EXTEND,
                       ISD::SIGN_EXTEND_INREG,
                       ISD::ANY_EXTEND,
                       ISD::EXTRACT_VECTOR_ELT,
                       ISD::INSERT_VECTOR_ELT,
                       ISD::FCOPYSIGN});

  if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
    setTargetDAGCombine(ISD::FP_ROUND);

  // All memory operations. Some folding on the pointer operand is done to help
  // matching the constant offsets in the addressing modes.
  setTargetDAGCombine({ISD::LOAD,
                       ISD::STORE,
                       ISD::ATOMIC_LOAD,
                       ISD::ATOMIC_STORE,
                       ISD::ATOMIC_CMP_SWAP,
                       ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
                       ISD::ATOMIC_SWAP,
                       ISD::ATOMIC_LOAD_ADD,
                       ISD::ATOMIC_LOAD_SUB,
                       ISD::ATOMIC_LOAD_AND,
                       ISD::ATOMIC_LOAD_OR,
                       ISD::ATOMIC_LOAD_XOR,
                       ISD::ATOMIC_LOAD_NAND,
                       ISD::ATOMIC_LOAD_MIN,
                       ISD::ATOMIC_LOAD_MAX,
                       ISD::ATOMIC_LOAD_UMIN,
                       ISD::ATOMIC_LOAD_UMAX,
                       ISD::ATOMIC_LOAD_FADD,
                       ISD::ATOMIC_LOAD_FMIN,
                       ISD::ATOMIC_LOAD_FMAX,
                       ISD::ATOMIC_LOAD_UINC_WRAP,
                       ISD::ATOMIC_LOAD_UDEC_WRAP,
                       ISD::ATOMIC_LOAD_USUB_COND,
                       ISD::ATOMIC_LOAD_USUB_SAT,
                       ISD::INTRINSIC_VOID,
                       ISD::INTRINSIC_W_CHAIN});

  // FIXME: In other contexts we pretend this is a per-function property.
  setStackPointerRegisterToSaveRestore(AMDGPU::SGPR32);

  setSchedulingPreference(Sched::RegPressure);
}

const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }

ArrayRef<MCPhysReg> SITargetLowering::getRoundingControlRegisters() const {
  static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
  return RCRegs;
}

//===----------------------------------------------------------------------===//
// TargetLowering queries
//===----------------------------------------------------------------------===//

// v_mad_mix* support a conversion from f16 to f32.
//
// There is only one special case where this is OK to use with denormals
// enabled, and we don't currently handle it.
bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
                                       EVT DestVT, EVT SrcVT) const {
  return DestVT.getScalarType() == MVT::f32 &&
         ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
            (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
           SrcVT.getScalarType() == MVT::f16) ||
          (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
           SrcVT.getScalarType() == MVT::bf16)) &&
         // TODO: This probably only requires no input flushing?
         denormalModeIsFlushAllF32(DAG.getMachineFunction());
}

bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode,
                                       LLT DestTy, LLT SrcTy) const {
  return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
         DestTy.getScalarSizeInBits() == 32 &&
         SrcTy.getScalarSizeInBits() == 16 &&
         // TODO: This probably only requires no input flushing?
         denormalModeIsFlushAllF32(*MI.getMF());
}

bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
  // SI has some legal vector types, but no legal vector operations. Say no
  // shuffles are legal in order to prefer scalarizing some vector operations.
  return false;
}

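// Register type used to pass values for non-kernel calling conventions:
// 16-bit elements are packed two to a register when 16-bit instructions are
// available, smaller types are widened, and anything wider than 32 bits is
// split into i32 pieces.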
MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
                                                    CallingConv::ID CC,
                                                    EVT VT) const {
  if (CC == CallingConv::AMDGPU_KERNEL)
    return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);

  if (VT.isVector()) {
    EVT ScalarVT = VT.getScalarType();
    unsigned Size = ScalarVT.getSizeInBits();
    if (Size == 16) {
      return Subtarget->has16BitInsts()
                 ? MVT::getVectorVT(ScalarVT.getSimpleVT(), 2)
                 : MVT::i32;
    }

    if (Size < 16)
      return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
    return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
  }

  if (!Subtarget->has16BitInsts() && VT.getSizeInBits() == 16)
    return MVT::i32;

  if (VT.getSizeInBits() > 32)
    return MVT::i32;

  return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}

unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
                                                         CallingConv::ID CC,
                                                         EVT VT) const {
  if (CC == CallingConv::AMDGPU_KERNEL)
    return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);

  if (VT.isVector()) {
    unsigned NumElts = VT.getVectorNumElements();
    EVT ScalarVT = VT.getScalarType();
    unsigned Size = ScalarVT.getSizeInBits();

    // FIXME: Should probably promote 8-bit vectors to i16.
    if (Size == 16)
      return (NumElts + 1) / 2;

    if (Size <= 32)
      return NumElts;

    if (Size > 32)
      return NumElts * ((Size + 31) / 32);
  } else if (VT.getSizeInBits() > 32)
    return (VT.getSizeInBits() + 31) / 32;

  return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
}

unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
    LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
    unsigned &NumIntermediates, MVT &RegisterVT) const {
  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
    unsigned NumElts = VT.getVectorNumElements();
    EVT ScalarVT = VT.getScalarType();
    unsigned Size = ScalarVT.getSizeInBits();
    // FIXME: We should fix the ABI to be the same on targets without 16-bit
    // support, but unless we can properly handle 3-vectors, it will still be
    // inconsistent.
    if (Size == 16) {
      MVT SimpleIntermediateVT =
          MVT::getVectorVT(ScalarVT.getSimpleVT(), ElementCount::getFixed(2));
      IntermediateVT = SimpleIntermediateVT;
      RegisterVT = Subtarget->has16BitInsts() ? SimpleIntermediateVT : MVT::i32;
      NumIntermediates = (NumElts + 1) / 2;
      return (NumElts + 1) / 2;
    }

    if (Size == 32) {
      RegisterVT = ScalarVT.getSimpleVT();
      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
    }

    if (Size < 16 && Subtarget->has16BitInsts()) {
      // FIXME: Should probably form v2i16 pieces
      RegisterVT = MVT::i16;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
    }

    if (Size != 16 && Size <= 32) {
      RegisterVT = MVT::i32;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
    }

    if (Size > 32) {
      RegisterVT = MVT::i32;
      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts * ((Size + 31) / 32);
      return NumIntermediates;
    }
  }

  return TargetLowering::getVectorTypeBreakdownForCallingConv(
      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
}

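// Compute the EVT of the data actually loaded by a load intrinsic returning
// \p Ty, clamped to at most \p MaxNumLanes vector elements.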
1227static EVT memVTFromLoadIntrData(const SITargetLowering &TLI,
1228 const DataLayout &DL, Type *Ty,
1229 unsigned MaxNumLanes) {
1230 assert(MaxNumLanes != 0);
1231
1232 LLVMContext &Ctx = Ty->getContext();
1233 if (auto *VT = dyn_cast<FixedVectorType>(Val: Ty)) {
1234 unsigned NumElts = std::min(a: MaxNumLanes, b: VT->getNumElements());
1235 return EVT::getVectorVT(Context&: Ctx, VT: TLI.getValueType(DL, Ty: VT->getElementType()),
1236 NumElements: NumElts);
1237 }
1238
1239 return TLI.getValueType(DL, Ty);
1240}
1241
1242// Peek through TFE struct returns to only use the data size.
1243static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI,
1244 const DataLayout &DL, Type *Ty,
1245 unsigned MaxNumLanes) {
1246 auto *ST = dyn_cast<StructType>(Val: Ty);
1247 if (!ST)
1248 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1249
1250 // TFE intrinsics return an aggregate type.
1251 assert(ST->getNumContainedTypes() == 2 &&
1252 ST->getContainedType(1)->isIntegerTy(32));
1253 return memVTFromLoadIntrData(TLI, DL, Ty: ST->getContainedType(i: 0), MaxNumLanes);
1254}
1255
1256/// Map address space 7 to MVT::amdgpuBufferFatPointer because that's its
1257/// in-memory representation. This return value is a custom type because there
1258/// is no MVT::i160 and adding one breaks integer promotion logic. While this
1259/// could cause issues during codegen, these address space 7 pointers will be
1260/// rewritten away by then. Therefore, we can return MVT::amdgpuBufferFatPointer
1261/// in order to allow pre-codegen passes that query TargetTransformInfo, often
1262/// for cost modeling, to work. (This also sets us up decently for doing the
1263/// buffer lowering in GlobalISel if SelectionDAG ever goes away.)
1264MVT SITargetLowering::getPointerTy(const DataLayout &DL, unsigned AS) const {
1265 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1266 return MVT::amdgpuBufferFatPointer;
1267 if (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
1268 DL.getPointerSizeInBits(AS) == 192)
1269 return MVT::amdgpuBufferStridedPointer;
1270 return AMDGPUTargetLowering::getPointerTy(DL, AS);
1271}
1272/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1273/// v8i32 when padding is added.
1274/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1275/// also v8i32 with padding.
1276MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const {
1277 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1278 DL.getPointerSizeInBits(AS) == 160) ||
1279 (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
1280 DL.getPointerSizeInBits(AS) == 192))
1281 return MVT::v8i32;
1282 return AMDGPUTargetLowering::getPointerMemTy(DL, AS);
1283}
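// Taken together with getPointerTy above: an address-space-7 pointer is
// MVT::amdgpuBufferFatPointer for addressing and legality queries, but its
// padded in-memory form (v8i32) when it is actually loaded or stored.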
1284
1285static unsigned getIntrMemWidth(unsigned IntrID) {
1286 switch (IntrID) {
1287 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1288 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1289 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1290 return 8;
1291 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1292 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1293 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1294 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1295 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1296 return 32;
1297 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1298 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1299 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1300 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1301 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1302 return 64;
1303 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1304 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1305 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1306 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
1307 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
1308 return 128;
1309 default:
1310 llvm_unreachable("Unknown width");
1311 }
1312}
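// Note (informal reading of the table above): for the cooperative atomics the
// _NxMB suffix appears to encode lanes x bytes per lane, and the value
// returned here is the per-lane access width in bits (e.g. _32x4B -> 32,
// _8x16B -> 128).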
1313
1314static void getCoopAtomicOperandsInfo(const CallBase &CI, bool IsLoad,
1315 TargetLoweringBase::IntrinsicInfo &Info) {
1316 Value *OrderingArg = CI.getArgOperand(i: IsLoad ? 1 : 2);
1317 unsigned Ord = cast<ConstantInt>(Val: OrderingArg)->getZExtValue();
1318 switch (AtomicOrderingCABI(Ord)) {
1319 case AtomicOrderingCABI::acquire:
1320 Info.order = AtomicOrdering::Acquire;
1321 break;
1322 case AtomicOrderingCABI::release:
1323 Info.order = AtomicOrdering::Release;
1324 break;
1325 case AtomicOrderingCABI::seq_cst:
1326 Info.order = AtomicOrdering::SequentiallyConsistent;
1327 break;
1328 default:
1329 Info.order = AtomicOrdering::Monotonic;
1330 break;
1331 }
1332
1333 Info.flags =
1334 (IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore);
1335 Info.flags |= MOCooperative;
1336
1337 MDNode *ScopeMD = cast<MDNode>(
1338 Val: cast<MetadataAsValue>(Val: CI.getArgOperand(i: IsLoad ? 2 : 3))->getMetadata());
1339 StringRef Scope = cast<MDString>(Val: ScopeMD->getOperand(I: 0))->getString();
1340 Info.ssid = CI.getContext().getOrInsertSyncScopeID(SSN: Scope);
1341}
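// As read above: loads take the C ABI ordering as operand 1 and the sync
// scope metadata as operand 2; stores use operands 2 and 3 (operand 1
// presumably being the stored data). Orderings other than
// acquire/release/seq_cst are conservatively treated as monotonic.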
1342
1343void SITargetLowering::getTgtMemIntrinsic(SmallVectorImpl<IntrinsicInfo> &Infos,
1344 const CallBase &CI,
1345 MachineFunction &MF,
1346 unsigned IntrID) const {
1347 IntrinsicInfo Info;
1348 Info.flags = MachineMemOperand::MONone;
1349 if (CI.hasMetadata(KindID: LLVMContext::MD_invariant_load))
1350 Info.flags |= MachineMemOperand::MOInvariant;
1351 if (CI.hasMetadata(KindID: LLVMContext::MD_nontemporal))
1352 Info.flags |= MachineMemOperand::MONonTemporal;
1353 Info.flags |= getTargetMMOFlags(I: CI);
1354
1355 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1356 AMDGPU::lookupRsrcIntrinsic(Intr: IntrID)) {
1357 AttributeSet Attr =
1358 Intrinsic::getFnAttributes(C&: CI.getContext(), id: (Intrinsic::ID)IntrID);
1359 MemoryEffects ME = Attr.getMemoryEffects();
1360 if (ME.doesNotAccessMemory())
1361 return;
1362
1363 // TODO: Should images get their own address space?
1364 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1365
1366 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1367 if (RsrcIntr->IsImage) {
1368 const AMDGPU::ImageDimIntrinsicInfo *Intr =
1369 AMDGPU::getImageDimIntrinsicInfo(Intr: IntrID);
1370 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Intr->BaseOpcode);
1371 Info.align.reset();
1372 }
1373
1374 Value *RsrcArg = CI.getArgOperand(i: RsrcIntr->RsrcArg);
1375 if (auto *RsrcPtrTy = dyn_cast<PointerType>(Val: RsrcArg->getType())) {
1376 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1377 // We conservatively set the memory operand of a buffer intrinsic to the
1378 // base resource pointer, so that we can access alias information about
1379 // those pointers. Cases like "this points at the same value
1380 // but with a different offset" are handled in
1381 // areMemAccessesTriviallyDisjoint.
1382 Info.ptrVal = RsrcArg;
1383 }
1384
1385 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1386 if (!IsSPrefetch) {
1387 auto *Aux = cast<ConstantInt>(Val: CI.getArgOperand(i: CI.arg_size() - 1));
1388 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1389 Info.flags |= MachineMemOperand::MOVolatile;
1390 }
1391
1392 Info.flags |= MachineMemOperand::MODereferenceable;
1393 if (ME.onlyReadsMemory()) {
1394 if (RsrcIntr->IsImage) {
1395 unsigned MaxNumLanes = 4;
1396
1397 if (!BaseOpcode->Gather4) {
1398 // If this isn't a gather, we may have excess loaded elements in the
1399 // IR type. Check the dmask for the real number of elements loaded.
1400 unsigned DMask =
1401 cast<ConstantInt>(Val: CI.getArgOperand(i: 0))->getZExtValue();
1402 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(Value: DMask);
1403 }
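// E.g. (illustrative) a dmask of 0b0101 loads two components, so an IR
// return type of <4 x float> is narrowed to a v2f32 memVT below (ignoring
// any TFE struct wrapper).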
1404
1405 Info.memVT = memVTFromLoadIntrReturn(TLI: *this, DL: MF.getDataLayout(),
1406 Ty: CI.getType(), MaxNumLanes);
1407 } else {
1408 Info.memVT =
1409 memVTFromLoadIntrReturn(TLI: *this, DL: MF.getDataLayout(), Ty: CI.getType(),
1410 MaxNumLanes: std::numeric_limits<unsigned>::max());
1411 }
1412
1413 // FIXME: What does alignment mean for an image?
1414 Info.opc = ISD::INTRINSIC_W_CHAIN;
1415 Info.flags |= MachineMemOperand::MOLoad;
1416 } else if (ME.onlyWritesMemory()) {
1417 Info.opc = ISD::INTRINSIC_VOID;
1418
1419 Type *DataTy = CI.getArgOperand(i: 0)->getType();
1420 if (RsrcIntr->IsImage) {
1421 unsigned DMask = cast<ConstantInt>(Val: CI.getArgOperand(i: 1))->getZExtValue();
1422 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(Value: DMask);
1423 Info.memVT = memVTFromLoadIntrData(TLI: *this, DL: MF.getDataLayout(), Ty: DataTy,
1424 MaxNumLanes: DMaskLanes);
1425 } else
1426 Info.memVT = getValueType(DL: MF.getDataLayout(), Ty: DataTy);
1427
1428 Info.flags |= MachineMemOperand::MOStore;
1429 } else {
1430 // Atomic, NoReturn Sampler or prefetch
1431 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID
1432 : ISD::INTRINSIC_W_CHAIN;
1433 Info.flags |=
1434 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable;
1435
1436 if (!IsSPrefetch)
1437 Info.flags |= MachineMemOperand::MOStore;
1438
1439 switch (IntrID) {
1440 default:
1441 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1442 // Fake memory access type for no return sampler intrinsics
1443 Info.memVT = MVT::i32;
1444 } else {
1445 // XXX - Should this be volatile without known ordering?
1446 Info.flags |= MachineMemOperand::MOVolatile;
1447 Info.memVT = MVT::getVT(Ty: CI.getArgOperand(i: 0)->getType());
1448 }
1449 break;
1450 case Intrinsic::amdgcn_raw_buffer_load_lds:
1451 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1452 case Intrinsic::amdgcn_struct_buffer_load_lds:
1453 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
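// The width operand (in bytes) gives the LDS transfer size; e.g. a width of
// 4 is modelled as an i32 access on the LDS destination pointer (operand 1).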
1454 unsigned Width = cast<ConstantInt>(Val: CI.getArgOperand(i: 2))->getZExtValue();
1455 Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: Width * 8);
1456 Info.ptrVal = CI.getArgOperand(i: 1);
1457 Infos.push_back(Elt: Info);
1458 return;
1459 }
1460 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1461 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1462 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1463 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1464 Info.memVT =
1465 memVTFromLoadIntrReturn(TLI: *this, DL: MF.getDataLayout(), Ty: CI.getType(),
1466 MaxNumLanes: std::numeric_limits<unsigned>::max());
1467 Info.flags &= ~MachineMemOperand::MOStore;
1468 Infos.push_back(Elt: Info);
1469 return;
1470 }
1471 }
1472 }
1473 Infos.push_back(Elt: Info);
1474 return;
1475 }
1476
1477 switch (IntrID) {
1478 case Intrinsic::amdgcn_ds_ordered_add:
1479 case Intrinsic::amdgcn_ds_ordered_swap: {
1480 Info.opc = ISD::INTRINSIC_W_CHAIN;
1481 Info.memVT = MVT::getVT(Ty: CI.getType());
1482 Info.ptrVal = CI.getOperand(i_nocapture: 0);
1483 Info.align.reset();
1484 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1485
1486 const ConstantInt *Vol = cast<ConstantInt>(Val: CI.getOperand(i_nocapture: 4));
1487 if (!Vol->isZero())
1488 Info.flags |= MachineMemOperand::MOVolatile;
1489
1490 Infos.push_back(Elt: Info);
1491 return;
1492 }
1493 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1494 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1495 Info.opc = ISD::INTRINSIC_W_CHAIN;
1496 Info.memVT = MVT::getVT(Ty: CI.getOperand(i_nocapture: 0)->getType());
1497 Info.ptrVal = nullptr;
1498 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1499 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1500 Infos.push_back(Elt: Info);
1501 return;
1502 }
1503 case Intrinsic::amdgcn_ds_append:
1504 case Intrinsic::amdgcn_ds_consume: {
1505 Info.opc = ISD::INTRINSIC_W_CHAIN;
1506 Info.memVT = MVT::getVT(Ty: CI.getType());
1507 Info.ptrVal = CI.getOperand(i_nocapture: 0);
1508 Info.align.reset();
1509 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1510
1511 const ConstantInt *Vol = cast<ConstantInt>(Val: CI.getOperand(i_nocapture: 1));
1512 if (!Vol->isZero())
1513 Info.flags |= MachineMemOperand::MOVolatile;
1514
1515 Infos.push_back(Elt: Info);
1516 return;
1517 }
1518 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1519 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1520 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1521 ? ISD::INTRINSIC_W_CHAIN
1522 : ISD::INTRINSIC_VOID;
1524 Info.ptrVal = CI.getOperand(i_nocapture: 0);
1525 Info.memVT = MVT::i64;
1526 Info.size = 8;
1527 Info.align.reset();
1528 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1529 Infos.push_back(Elt: Info);
1530 return;
1531 }
1532 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1533 case Intrinsic::amdgcn_image_bvh_intersect_ray:
1534 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1535 Info.opc = ISD::INTRINSIC_W_CHAIN;
1536 Info.memVT =
1537 MVT::getVT(Ty: IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1538 ? CI.getType()
1539 : cast<StructType>(Val: CI.getType())
1540 ->getElementType(N: 0)); // XXX: what is correct VT?
1541
1542 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1543 Info.align.reset();
1544 Info.flags |=
1545 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable;
1546 Infos.push_back(Elt: Info);
1547 return;
1548 }
1549 case Intrinsic::amdgcn_global_atomic_fmin_num:
1550 case Intrinsic::amdgcn_global_atomic_fmax_num:
1551 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1552 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1553 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
1554 Info.opc = ISD::INTRINSIC_W_CHAIN;
1555 Info.memVT = MVT::getVT(Ty: CI.getType());
1556 Info.ptrVal = CI.getOperand(i_nocapture: 0);
1557 Info.align.reset();
1558 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
1559 MachineMemOperand::MODereferenceable |
1560 MachineMemOperand::MOVolatile;
1561 Infos.push_back(Elt: Info);
1562 return;
1563 }
1564 case Intrinsic::amdgcn_flat_load_monitor_b32:
1565 case Intrinsic::amdgcn_flat_load_monitor_b64:
1566 case Intrinsic::amdgcn_flat_load_monitor_b128:
1567 case Intrinsic::amdgcn_global_load_monitor_b32:
1568 case Intrinsic::amdgcn_global_load_monitor_b64:
1569 case Intrinsic::amdgcn_global_load_monitor_b128:
1570 case Intrinsic::amdgcn_cluster_load_b32:
1571 case Intrinsic::amdgcn_cluster_load_b64:
1572 case Intrinsic::amdgcn_cluster_load_b128:
1573 case Intrinsic::amdgcn_ds_load_tr6_b96:
1574 case Intrinsic::amdgcn_ds_load_tr4_b64:
1575 case Intrinsic::amdgcn_ds_load_tr8_b64:
1576 case Intrinsic::amdgcn_ds_load_tr16_b128:
1577 case Intrinsic::amdgcn_global_load_tr6_b96:
1578 case Intrinsic::amdgcn_global_load_tr4_b64:
1579 case Intrinsic::amdgcn_global_load_tr_b64:
1580 case Intrinsic::amdgcn_global_load_tr_b128:
1581 case Intrinsic::amdgcn_ds_read_tr4_b64:
1582 case Intrinsic::amdgcn_ds_read_tr6_b96:
1583 case Intrinsic::amdgcn_ds_read_tr8_b64:
1584 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1585 Info.opc = ISD::INTRINSIC_W_CHAIN;
1586 Info.memVT = MVT::getVT(Ty: CI.getType());
1587 Info.ptrVal = CI.getOperand(i_nocapture: 0);
1588 Info.align.reset();
1589 Info.flags |= MachineMemOperand::MOLoad;
1590 Infos.push_back(Elt: Info);
1591 return;
1592 }
1593 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1594 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1595 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
1596 Info.opc = ISD::INTRINSIC_W_CHAIN;
1597 Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: getIntrMemWidth(IntrID));
1598 Info.ptrVal = CI.getOperand(i_nocapture: 0);
1599 Info.align.reset();
1600 getCoopAtomicOperandsInfo(CI, /*IsLoad=*/true, Info);
1601 Infos.push_back(Elt: Info);
1602 return;
1603 }
1604 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1605 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1606 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
1607 Info.opc = ISD::INTRINSIC_VOID;
1608 Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: getIntrMemWidth(IntrID));
1609 Info.ptrVal = CI.getArgOperand(i: 0);
1610 Info.align.reset();
1611 getCoopAtomicOperandsInfo(CI, /*IsLoad=*/false, Info);
1612 Infos.push_back(Elt: Info);
1613 return;
1614 }
1615 case Intrinsic::amdgcn_ds_gws_init:
1616 case Intrinsic::amdgcn_ds_gws_barrier:
1617 case Intrinsic::amdgcn_ds_gws_sema_v:
1618 case Intrinsic::amdgcn_ds_gws_sema_br:
1619 case Intrinsic::amdgcn_ds_gws_sema_p:
1620 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1621 Info.opc = ISD::INTRINSIC_VOID;
1622
1623 const GCNTargetMachine &TM =
1624 static_cast<const GCNTargetMachine &>(getTargetMachine());
1625
1626 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1627 Info.ptrVal = MFI->getGWSPSV(TM);
1628
1629 // This is an abstract access, but we need to specify a type and size.
1630 Info.memVT = MVT::i32;
1631 Info.size = 4;
1632 Info.align = Align(4);
1633
1634 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1635 Info.flags |= MachineMemOperand::MOLoad;
1636 else
1637 Info.flags |= MachineMemOperand::MOStore;
1638 Infos.push_back(Elt: Info);
1639 return;
1640 }
1641 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1642 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1643 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1644 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1645 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1646 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1647 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1648 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
1649 Info.opc = ISD::INTRINSIC_VOID;
1650 Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: getIntrMemWidth(IntrID));
1651 Info.ptrVal = CI.getArgOperand(i: 1);
1652 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1653 Infos.push_back(Elt: Info);
1654 return;
1655 }
1656 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1657 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1658 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1659 case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
1660 Info.opc = ISD::INTRINSIC_VOID;
1661 Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: getIntrMemWidth(IntrID));
1662 Info.ptrVal = CI.getArgOperand(i: 0);
1663 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1664 Infos.push_back(Elt: Info);
1665 return;
1666 }
1667 case Intrinsic::amdgcn_load_to_lds:
1668 case Intrinsic::amdgcn_global_load_lds: {
1669 Info.opc = ISD::INTRINSIC_VOID;
1670 unsigned Width = cast<ConstantInt>(Val: CI.getArgOperand(i: 2))->getZExtValue();
1671 Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: Width * 8);
1672 Info.ptrVal = CI.getArgOperand(i: 1);
1673 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1674 auto *Aux = cast<ConstantInt>(Val: CI.getArgOperand(i: CI.arg_size() - 1));
1675 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1676 Info.flags |= MachineMemOperand::MOVolatile;
1677 Infos.push_back(Elt: Info);
1678 return;
1679 }
1680 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1681 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1682 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1683 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1684 Info.opc = ISD::INTRINSIC_W_CHAIN;
1685
1686 const GCNTargetMachine &TM =
1687 static_cast<const GCNTargetMachine &>(getTargetMachine());
1688
1689 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1690 Info.ptrVal = MFI->getGWSPSV(TM);
1691
1692 // This is an abstract access, but we need to specify a type and size.
1693 Info.memVT = MVT::i32;
1694 Info.size = 4;
1695 Info.align = Align(4);
1696
1697 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1698 Infos.push_back(Elt: Info);
1699 return;
1700 }
1701 case Intrinsic::amdgcn_s_prefetch_data:
1702 case Intrinsic::amdgcn_flat_prefetch:
1703 case Intrinsic::amdgcn_global_prefetch: {
1704 Info.opc = ISD::INTRINSIC_VOID;
1705 Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: 8);
1706 Info.ptrVal = CI.getArgOperand(i: 0);
1707 Info.flags |= MachineMemOperand::MOLoad;
1708 Infos.push_back(Elt: Info);
1709 return;
1710 }
1711 default:
1712 return;
1713 }
1714}
1715
1716void SITargetLowering::CollectTargetIntrinsicOperands(
1717 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1718 switch (cast<IntrinsicInst>(Val: I).getIntrinsicID()) {
1719 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1720 // The DAG's ValueType loses the addrspaces.
1721 // Add them as 2 extra Constant operands "from" and "to".
1722 unsigned SrcAS = I.getOperand(i_nocapture: 0)->getType()->getPointerAddressSpace();
1723 unsigned DstAS = I.getType()->getPointerAddressSpace();
1724 Ops.push_back(Elt: DAG.getTargetConstant(Val: SrcAS, DL: SDLoc(), VT: MVT::i32));
1725 Ops.push_back(Elt: DAG.getTargetConstant(Val: DstAS, DL: SDLoc(), VT: MVT::i32));
1726 break;
1727 }
1728 default:
1729 break;
1730 }
1731}
1732
1733bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
1734 SmallVectorImpl<Value *> &Ops,
1735 Type *&AccessTy) const {
1736 Value *Ptr = nullptr;
1737 switch (II->getIntrinsicID()) {
1738 case Intrinsic::amdgcn_cluster_load_b128:
1739 case Intrinsic::amdgcn_cluster_load_b64:
1740 case Intrinsic::amdgcn_cluster_load_b32:
1741 case Intrinsic::amdgcn_ds_append:
1742 case Intrinsic::amdgcn_ds_consume:
1743 case Intrinsic::amdgcn_ds_load_tr8_b64:
1744 case Intrinsic::amdgcn_ds_load_tr16_b128:
1745 case Intrinsic::amdgcn_ds_load_tr4_b64:
1746 case Intrinsic::amdgcn_ds_load_tr6_b96:
1747 case Intrinsic::amdgcn_ds_read_tr4_b64:
1748 case Intrinsic::amdgcn_ds_read_tr6_b96:
1749 case Intrinsic::amdgcn_ds_read_tr8_b64:
1750 case Intrinsic::amdgcn_ds_read_tr16_b64:
1751 case Intrinsic::amdgcn_ds_ordered_add:
1752 case Intrinsic::amdgcn_ds_ordered_swap:
1753 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1754 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1755 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1756 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1757 case Intrinsic::amdgcn_flat_load_monitor_b128:
1758 case Intrinsic::amdgcn_flat_load_monitor_b32:
1759 case Intrinsic::amdgcn_flat_load_monitor_b64:
1760 case Intrinsic::amdgcn_global_atomic_fmax_num:
1761 case Intrinsic::amdgcn_global_atomic_fmin_num:
1762 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1763 case Intrinsic::amdgcn_global_load_monitor_b128:
1764 case Intrinsic::amdgcn_global_load_monitor_b32:
1765 case Intrinsic::amdgcn_global_load_monitor_b64:
1766 case Intrinsic::amdgcn_global_load_tr_b64:
1767 case Intrinsic::amdgcn_global_load_tr_b128:
1768 case Intrinsic::amdgcn_global_load_tr4_b64:
1769 case Intrinsic::amdgcn_global_load_tr6_b96:
1770 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1771 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1772 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1773 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1774 Ptr = II->getArgOperand(i: 0);
1775 break;
1776 case Intrinsic::amdgcn_load_to_lds:
1777 case Intrinsic::amdgcn_global_load_lds:
1778 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1779 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1780 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1781 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1782 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1783 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1784 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1785 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1786 Ptr = II->getArgOperand(i: 1);
1787 break;
1788 default:
1789 return false;
1790 }
1791 AccessTy = II->getType();
1792 Ops.push_back(Elt: Ptr);
1793 return true;
1794}
1795
1796bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM,
1797 unsigned AddrSpace) const {
1798 if (!Subtarget->hasFlatInstOffsets()) {
1799 // Flat instructions do not have offsets, and only have the register
1800 // address.
1801 return AM.BaseOffs == 0 && AM.Scale == 0;
1802 }
1803
1804 decltype(SIInstrFlags::FLAT) FlatVariant =
1805 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ? SIInstrFlags::FlatGlobal
1806 : AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ? SIInstrFlags::FlatScratch
1807 : SIInstrFlags::FLAT;
1808
1809 return AM.Scale == 0 &&
1810 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1811 Offset: AM.BaseOffs, AddrSpace, FlatVariant));
1812}
1813
1814bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
1815 if (Subtarget->hasFlatGlobalInsts())
1816 return isLegalFlatAddressingMode(AM, AddrSpace: AMDGPUAS::GLOBAL_ADDRESS);
1817
1818 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1819 // Assume that we will use FLAT for all global memory accesses
1820 // on VI.
1821 // FIXME: This assumption is currently wrong. On VI we still use
1822 // MUBUF instructions for the r + i addressing mode. As currently
1823 // implemented, the MUBUF instructions only work on buffer < 4GB.
1824 // It may be possible to support > 4GB buffers with MUBUF instructions,
1825 // by setting the stride value in the resource descriptor which would
1826 // increase the size limit to (stride * 4GB). However, this is risky,
1827 // because it has never been validated.
1828 return isLegalFlatAddressingMode(AM, AddrSpace: AMDGPUAS::FLAT_ADDRESS);
1829 }
1830
1831 return isLegalMUBUFAddressingMode(AM);
1832}
1833
1834bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1835 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1836 // additionally can do r + r + i with addr64. 32-bit has more addressing
1837 // mode options. Depending on the resource constant, it can also do
1838 // (i64 r0) + (i32 r1) * (i14 i).
1839 //
1840 // Private arrays end up using a scratch buffer most of the time, so also
1841 // assume those use MUBUF instructions. Scratch loads / stores are currently
1842 // implemented as mubuf instructions with offen bit set, so slightly
1843 // different from the normal addr64.
1844 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1845 if (!TII->isLegalMUBUFImmOffset(Imm: AM.BaseOffs))
1846 return false;
1847
1848 // FIXME: Since we can split immediate into soffset and immediate offset,
1849 // would it make sense to allow any immediate?
1850
1851 switch (AM.Scale) {
1852 case 0: // r + i or just i, depending on HasBaseReg.
1853 return true;
1854 case 1:
1855 return true; // We have r + r or r + i.
1856 case 2:
1857 if (AM.HasBaseReg) {
1858 // Reject 2 * r + r.
1859 return false;
1860 }
1861
1862 // Allow 2 * r as r + r
1863 // Or 2 * r + i is allowed as r + r + i.
1864 return true;
1865 default: // Don't allow n * r
1866 return false;
1867 }
1868}
1869
1870bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
1871 const AddrMode &AM, Type *Ty,
1872 unsigned AS,
1873 Instruction *I) const {
1874 // No global is ever allowed as a base.
1875 if (AM.BaseGV)
1876 return false;
1877
1878 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1879 return isLegalGlobalAddressingMode(AM);
1880
1881 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1882 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
1883 AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE ||
1884 AS == AMDGPUAS::BUFFER_STRIDED_POINTER) {
1885 // If the offset isn't a multiple of 4, it probably isn't going to be
1886 // correctly aligned.
1887 // FIXME: Can we get the real alignment here?
1888 if (AM.BaseOffs % 4 != 0)
1889 return isLegalMUBUFAddressingMode(AM);
1890
1891 if (!Subtarget->hasScalarSubwordLoads()) {
1892 // There are no SMRD extloads, so if we have to do a small type access we
1893 // will use a MUBUF load.
1894 // FIXME?: We also need to do this if unaligned, but we don't know the
1895 // alignment here.
1896 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1897 return isLegalGlobalAddressingMode(AM);
1898 }
1899
1900 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1901 // SMRD instructions have an 8-bit, dword offset on SI.
1902 if (!isUInt<8>(x: AM.BaseOffs / 4))
1903 return false;
1904 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1905 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1906 // in 8-bits, it can use a smaller encoding.
1907 if (!isUInt<32>(x: AM.BaseOffs / 4))
1908 return false;
1909 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1910 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1911 if (!isUInt<20>(x: AM.BaseOffs))
1912 return false;
1913 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1914 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1915 // for S_BUFFER_* instructions).
1916 if (!isInt<21>(x: AM.BaseOffs))
1917 return false;
1918 } else {
1919 // On GFX12, all offsets are signed 24-bit in bytes.
1920 if (!isInt<24>(x: AM.BaseOffs))
1921 return false;
1922 }
1923
1924 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1925 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
1926 AM.BaseOffs < 0) {
1927 // Scalar (non-buffer) loads can only use a negative offset if
1928 // soffset+offset is non-negative. Since the compiler can only prove that
1929 // in a few special cases, it is safer to claim that negative offsets are
1930 // not supported.
1931 return false;
1932 }
1933
1934 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1935 return true;
1936
1937 if (AM.Scale == 1 && AM.HasBaseReg)
1938 return true;
1939
1940 return false;
1941 }
1942
1943 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1944 return Subtarget->hasFlatScratchEnabled()
1945 ? isLegalFlatAddressingMode(AM, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS)
1946 : isLegalMUBUFAddressingMode(AM);
1947
1948 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1949 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1950 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1951 // field.
1952 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1953 // an 8-bit dword offset but we don't know the alignment here.
1954 if (!isUInt<16>(x: AM.BaseOffs))
1955 return false;
1956
1957 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1958 return true;
1959
1960 if (AM.Scale == 1 && AM.HasBaseReg)
1961 return true;
1962
1963 return false;
1964 }
1965
1966 if (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) {
1967 // For an unknown address space, this usually means that this is for some
1968 // reason being used for pure arithmetic, and not based on some addressing
1969 // computation. We don't have instructions that compute pointers with any
1970 // addressing modes, so treat them as having no offset like flat
1971 // instructions.
1972 return isLegalFlatAddressingMode(AM, AddrSpace: AMDGPUAS::FLAT_ADDRESS);
1973 }
1974
1975 // Assume a user alias of global for unknown address spaces.
1976 return isLegalGlobalAddressingMode(AM);
1977}
1978
1979bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
1980 const MachineFunction &MF) const {
1981 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
1982 return (MemVT.getSizeInBits() <= 4 * 32);
1983 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1984 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1985 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1986 }
1987 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
1988 return (MemVT.getSizeInBits() <= 2 * 32);
1989 return true;
1990}
1991
1992bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
1993 unsigned Size, unsigned AddrSpace, Align Alignment,
1994 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1995 if (IsFast)
1996 *IsFast = 0;
1997
1998 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1999 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
2000 // Check if alignment requirements for ds_read/write instructions are
2001 // disabled.
2002 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
2003 return false;
2004
2005 Align RequiredAlignment(
2006 PowerOf2Ceil(A: divideCeil(Numerator: Size, Denominator: 8))); // Natural alignment.
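// E.g. Size == 96 gives divideCeil(96, 8) == 12 and PowerOf2Ceil(12) == 16,
// i.e. Align(16).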
2007 if (Subtarget->hasLDSMisalignedBugInWGPMode() && Size > 32 &&
2008 Alignment < RequiredAlignment)
2009 return false;
2010
2011 // Either the alignment requirements are "enabled", or there is an
2012 // unaligned-LDS-access hardware bug even though the alignment requirements
2013 // are "disabled". In either case, we need to check for proper alignment
2014 // requirements.
2015 //
2016 switch (Size) {
2017 case 64:
2018 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
2019 // address is negative, then the instruction is incorrectly treated as
2020 // out-of-bounds even if base + offsets is in bounds. Split vectorized
2021 // loads here to avoid emitting ds_read2_b32. We may re-combine the
2022 // load later in the SILoadStoreOptimizer.
2023 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
2024 return false;
2025
2026 // 8-byte accesses via ds_read/write_b64 require 8-byte alignment, but we
2027 // can do a 4-byte aligned, 8-byte access in a single operation using
2028 // ds_read2/write2_b32 with adjacent offsets.
2029 RequiredAlignment = Align(4);
2030
2031 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2032 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
2033 // ds_write2_b32 depending on the alignment. In either case with either
2034 // alignment there is no faster way of doing this.
2035
2036 // The numbers returned here and below are not additive; they form a
2037 // 'speed rank'. They are only meant to be compared to decide whether one
2038 // way of lowering an operation is faster than another. For that purpose a
2039 // naturally aligned operation gets its bitsize to indicate that "it
2040 // operates with a speed comparable to an N-bit wide load". With the full
2041 // alignment ds128 is slower than ds96 for example. If underaligned it is
2042 // comparable to the speed of a single dword access, which would then mean
2043 // 32 < 128 and it is faster to issue a wide load regardless. 1 simply
2044 // means "slow, don't do it": when comparing an aligned load to a wider
2045 // load that will no longer be aligned, the latter is slower.
2046 if (IsFast)
2047 *IsFast = (Alignment >= RequiredAlignment) ? 64
2048 : (Alignment < Align(4)) ? 32
2049 : 1;
2050 return true;
2051 }
2052
2053 break;
2054 case 96:
2055 if (!Subtarget->hasDS96AndDS128())
2056 return false;
2057
2058 // 12-byte accesses via ds_read/write_b96 require 16-byte alignment on
2059 // gfx8 and older.
2060
2061 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2062 // Naturally aligned access is fastest. However, also report it as fast
2063 // if memory is aligned to less than a DWORD. A narrow load or store will
2064 // be equally slow as a single ds_read_b96/ds_write_b96, but there will
2065 // be more of them, so overall we pay less of a penalty by issuing a
2066 // single instruction.
2067
2068 // See comment on the values above.
2069 if (IsFast)
2070 *IsFast = (Alignment >= RequiredAlignment) ? 96
2071 : (Alignment < Align(4)) ? 32
2072 : 1;
2073 return true;
2074 }
2075
2076 break;
2077 case 128:
2078 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
2079 return false;
2080
2081 // 16-byte accesses via ds_read/write_b128 require 16-byte alignment on
2082 // gfx8 and older, but we can do an 8-byte aligned, 16-byte access in a
2083 // single operation using ds_read2/write2_b64.
2084 RequiredAlignment = Align(8);
2085
2086 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2087 // Naturally aligned access is fastest. However, also report it as fast
2088 // if memory is aligned to less than a DWORD. A narrow load or store will
2089 // be equally slow as a single ds_read_b128/ds_write_b128, but there
2090 // will be more of them, so overall we pay less of a penalty by issuing
2091 // a single instruction.
2092
2093 // See comment on the values above.
2094 if (IsFast)
2095 *IsFast = (Alignment >= RequiredAlignment) ? 128
2096 : (Alignment < Align(4)) ? 32
2097 : 1;
2098 return true;
2099 }
2100
2101 break;
2102 default:
2103 if (Size > 32)
2104 return false;
2105
2106 break;
2107 }
2108
2109 // See comment on the values above.
2110 // Note that we have a single-dword or sub-dword here, so if underaligned
2111 // it is the slowest possible access, hence the returned value is 0.
2112 if (IsFast)
2113 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
2114
2115 return Alignment >= RequiredAlignment ||
2116 Subtarget->hasUnalignedDSAccessEnabled();
2117 }
2118
2119 // FIXME: We have to be conservative here and assume that flat operations
2120 // will access scratch. If we had access to the IR function, then we
2121 // could determine if any private memory was used in the function.
2122 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
2123 AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
2124 bool AlignedBy4 = Alignment >= Align(4);
2125 if (Subtarget->hasUnalignedScratchAccessEnabled()) {
2126 if (IsFast)
2127 *IsFast = AlignedBy4 ? Size : 1;
2128 return true;
2129 }
2130
2131 if (IsFast)
2132 *IsFast = AlignedBy4;
2133
2134 return AlignedBy4;
2135 }
2136
2137 // So long as they are correct, wide global memory operations perform better
2138 // than multiple smaller memory ops -- even when misaligned.
2139 if (AMDGPU::isExtendedGlobalAddrSpace(AS: AddrSpace)) {
2140 if (IsFast)
2141 *IsFast = Size;
2142
2143 return Alignment >= Align(4) ||
2144 Subtarget->hasUnalignedBufferAccessEnabled();
2145 }
2146
2147 // Ensure robust out-of-bounds guarantees for buffer accesses are met if
2148 // RelaxedBufferOOBMode is disabled. Normally hardware will ensure proper
2149 // out-of-bounds behavior, but in the edge case where an access starts
2150 // out-of-bounds and then enters in-bounds, the entire access would be treated
2151 // as out-of-bounds. Prevent misaligned memory accesses by requiring the
2152 // natural alignment of buffer accesses.
2153 if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
2154 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
2155 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
2156 if (!Subtarget->hasRelaxedBufferOOBMode() &&
2157 Alignment < Align(PowerOf2Ceil(A: divideCeil(Numerator: Size, Denominator: 8))))
2158 return false;
2159 }
2160
2161 // Smaller than dword value must be aligned.
2162 if (Size < 32)
2163 return false;
2164
2165 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
2166 // byte-address are ignored, thus forcing Dword alignment.
2167 // This applies to private, global, and constant memory.
2168 if (IsFast)
2169 *IsFast = 1;
2170
2171 return Size >= 32 && Alignment >= Align(4);
2172}
2173
2174bool SITargetLowering::allowsMisalignedMemoryAccesses(
2175 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2176 unsigned *IsFast) const {
2177 return allowsMisalignedMemoryAccessesImpl(Size: VT.getSizeInBits(), AddrSpace,
2178 Alignment, Flags, IsFast);
2179}
2180
2181EVT SITargetLowering::getOptimalMemOpType(
2182 LLVMContext &Context, const MemOp &Op,
2183 const AttributeList &FuncAttributes) const {
2184 // FIXME: Should account for address space here.
2185
2186 // The default fallback uses the private pointer size as a guess for a type to
2187 // use. Make sure we switch these to 64-bit accesses.
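// E.g. a 16-byte memcpy whose destination is only 4-byte aligned is still
// expanded with v4i32 accesses below rather than that pointer-sized default.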
2188
2189 if (Op.size() >= 16 &&
2190 Op.isDstAligned(AlignCheck: Align(4))) // XXX: Should only do for global
2191 return MVT::v4i32;
2192
2193 if (Op.size() >= 8 && Op.isDstAligned(AlignCheck: Align(4)))
2194 return MVT::v2i32;
2195
2196 // Use the default.
2197 return MVT::Other;
2198}
2199
2200bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
2201 const MemSDNode *MemNode = cast<MemSDNode>(Val: N);
2202 return MemNode->getMemOperand()->getFlags() & MONoClobber;
2203}
2204
2205bool SITargetLowering::isNonGlobalAddrSpace(unsigned AS) {
2206 return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
2207 AS == AMDGPUAS::PRIVATE_ADDRESS;
2208}
2209
2210bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
2211 unsigned DestAS) const {
2212 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
2213 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2214 Subtarget->hasGloballyAddressableScratch()) {
2215 // Flat -> private requires subtracting src_flat_scratch_base_lo.
2216 return false;
2217 }
2218
2219 // Flat -> private/local is a simple truncate.
2220 // Flat -> global is no-op
2221 return true;
2222 }
2223
2224 const GCNTargetMachine &TM =
2225 static_cast<const GCNTargetMachine &>(getTargetMachine());
2226 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
2227}
2228
2229TargetLoweringBase::LegalizeTypeAction
2230SITargetLowering::getPreferredVectorAction(MVT VT) const {
2231 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2232 VT.getScalarType().bitsLE(VT: MVT::i16))
2233 return VT.isPow2VectorType() ? TypeSplitVector : TypeWidenVector;
2234 return TargetLoweringBase::getPreferredVectorAction(VT);
2235}
2236
2237bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
2238 Type *Ty) const {
2239 // FIXME: Could be smarter if called for vector constants.
2240 return true;
2241}
2242
2243bool SITargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
2244 unsigned Index) const {
2245 if (!isOperationLegalOrCustom(Op: ISD::EXTRACT_SUBVECTOR, VT: ResVT))
2246 return false;
2247
2248 // TODO: Add more cases that are cheap.
2249 return Index == 0;
2250}
2251
2252bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
2253 // TODO: This should be more aggressive, particularly for 16-bit element
2254 // vectors. However, there are some mixed improvements and regressions.
2255 EVT EltTy = VT.getVectorElementType();
2256 unsigned MinAlign = Subtarget->useRealTrue16Insts() ? 16 : 32;
2257 return EltTy.getSizeInBits() % MinAlign == 0;
2258}
2259
2260bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
2261 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2262 switch (Op) {
2263 case ISD::LOAD:
2264 case ISD::STORE:
2265 return true;
2266 default:
2267 return false;
2268 }
2269 }
2270
2271 // SimplifySetCC uses this function to determine whether or not it should
2272 // create setcc with i1 operands. We don't have instructions for i1 setcc.
2273 if (VT == MVT::i1 && Op == ISD::SETCC)
2274 return false;
2275
2276 return TargetLowering::isTypeDesirableForOp(Op, VT);
2277}
2278
2279MachinePointerInfo
2280SITargetLowering::getKernargSegmentPtrInfo(MachineFunction &MF) const {
2281 // This isn't really a constant pool but close enough.
2282 MachinePointerInfo PtrInfo(MF.getPSVManager().getConstantPool());
2283 PtrInfo.AddrSpace = AMDGPUAS::CONSTANT_ADDRESS;
2284 return PtrInfo;
2285}
2286
2287SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
2288 const SDLoc &SL,
2289 SDValue Chain,
2290 uint64_t Offset) const {
2291 const DataLayout &DL = DAG.getDataLayout();
2292 MachineFunction &MF = DAG.getMachineFunction();
2293 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2294 MVT PtrVT = getPointerTy(DL, AS: AMDGPUAS::CONSTANT_ADDRESS);
2295
2296 auto [InputPtrReg, RC, ArgTy] =
2297 Info->getPreloadedValue(Value: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2298
2299 // We may not have the kernarg segment argument if we have no kernel
2300 // arguments.
2301 if (!InputPtrReg)
2302 return DAG.getConstant(Val: Offset, DL: SL, VT: PtrVT);
2303
2304 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
2305 SDValue BasePtr = DAG.getCopyFromReg(
2306 Chain, dl: SL, Reg: MRI.getLiveInVirtReg(PReg: InputPtrReg->getRegister()), VT: PtrVT);
2307
2308 return DAG.getObjectPtrOffset(SL, Ptr: BasePtr, Offset: TypeSize::getFixed(ExactSize: Offset));
2309}
2310
2311SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
2312 const SDLoc &SL) const {
2313 uint64_t Offset =
2314 getImplicitParameterOffset(MF: DAG.getMachineFunction(), Param: FIRST_IMPLICIT);
2315 return lowerKernArgParameterPtr(DAG, SL, Chain: DAG.getEntryNode(), Offset);
2316}
2317
2318SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2319 const SDLoc &SL) const {
2320
2321 Function &F = DAG.getMachineFunction().getFunction();
2322 std::optional<uint32_t> KnownSize =
2323 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
2324 if (KnownSize.has_value())
2325 return DAG.getConstant(Val: *KnownSize, DL: SL, VT: MVT::i32);
2326 return SDValue();
2327}
2328
2329SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2330 const SDLoc &SL, SDValue Val,
2331 bool Signed,
2332 const ISD::InputArg *Arg) const {
2333 // First, if it is a widened vector, narrow it.
2334 if (VT.isVector() &&
2335 VT.getVectorNumElements() != MemVT.getVectorNumElements()) {
2336 EVT NarrowedVT =
2337 EVT::getVectorVT(Context&: *DAG.getContext(), VT: MemVT.getVectorElementType(),
2338 NumElements: VT.getVectorNumElements());
2339 Val = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: NarrowedVT, N1: Val,
2340 N2: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32));
2341 }
2342
2343 // Then convert the vector elements or scalar value.
2344 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(VT: MemVT)) {
2345 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2346 Val = DAG.getNode(Opcode: Opc, DL: SL, VT: MemVT, N1: Val, N2: DAG.getValueType(VT));
2347 }
2348
2349 if (MemVT.isFloatingPoint()) {
2350 if (VT.isFloatingPoint()) {
2351 Val = getFPExtOrFPRound(DAG, Op: Val, DL: SL, VT);
2352 } else {
2353 assert(!MemVT.isVector());
2354 EVT IntVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: MemVT.getSizeInBits());
2355 SDValue Cast = DAG.getBitcast(VT: IntVT, V: Val);
2356 Val = DAG.getAnyExtOrTrunc(Op: Cast, DL: SL, VT);
2357 }
2358 } else if (Signed)
2359 Val = DAG.getSExtOrTrunc(Op: Val, DL: SL, VT);
2360 else
2361 Val = DAG.getZExtOrTrunc(Op: Val, DL: SL, VT);
2362
2363 return Val;
2364}
2365
2366SDValue SITargetLowering::lowerKernargMemParameter(
2367 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2368 uint64_t Offset, Align Alignment, bool Signed,
2369 const ISD::InputArg *Arg) const {
2370
2371 MachinePointerInfo PtrInfo =
2372 getKernargSegmentPtrInfo(MF&: DAG.getMachineFunction());
2373
2374 // Try to avoid using an extload by loading earlier than the argument address,
2375 // and extracting the relevant bits. The load should hopefully be merged with
2376 // the previous argument.
2377 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2378 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2379 int64_t AlignDownOffset = alignDown(Value: Offset, Align: 4);
2380 int64_t OffsetDiff = Offset - AlignDownOffset;
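// E.g. (illustrative) an i16 argument at Offset == 2 with Alignment == 2:
// AlignDownOffset == 0 and OffsetDiff == 2, so the dword at offset 0 is
// loaded and shifted right by 16 bits below.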
2381
2382 EVT IntVT = MemVT.changeTypeToInteger();
2383
2384 // TODO: If we passed in the base kernel offset we could have a better
2385 // alignment than 4, but we don't really need it.
2386 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset: AlignDownOffset);
2387 SDValue Load = DAG.getLoad(VT: MVT::i32, dl: SL, Chain, Ptr,
2388 PtrInfo: PtrInfo.getWithOffset(O: AlignDownOffset), Alignment: Align(4),
2389 MMOFlags: MachineMemOperand::MODereferenceable |
2390 MachineMemOperand::MOInvariant);
2391
2392 SDValue ShiftAmt = DAG.getConstant(Val: OffsetDiff * 8, DL: SL, VT: MVT::i32);
2393 SDValue Extract = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: MVT::i32, N1: Load, N2: ShiftAmt);
2394
2395 SDValue ArgVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: IntVT, Operand: Extract);
2396 ArgVal = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MemVT, Operand: ArgVal);
2397 ArgVal = convertArgType(DAG, VT, MemVT, SL, Val: ArgVal, Signed, Arg);
2398
2399 return DAG.getMergeValues(Ops: {ArgVal, Load.getValue(R: 1)}, dl: SL);
2400 }
2401
2402 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2403 SDValue Load = DAG.getLoad(
2404 VT: MemVT, dl: SL, Chain, Ptr, PtrInfo: PtrInfo.getWithOffset(O: Offset), Alignment,
2405 MMOFlags: MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant);
2406
2407 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Val: Load, Signed, Arg);
2408 return DAG.getMergeValues(Ops: {Val, Load.getValue(R: 1)}, dl: SL);
2409}
2410
2411/// Coerce an argument which was passed in a different ABI type to the original
2412/// expected value type.
2413SDValue SITargetLowering::convertABITypeToValueType(SelectionDAG &DAG,
2414 SDValue Val,
2415 CCValAssign &VA,
2416 const SDLoc &SL) const {
2417 EVT ValVT = VA.getValVT();
2418
2419 // If this is an 8 or 16-bit value, it is really passed promoted
2420 // to 32 bits. Insert an assert[sz]ext to capture this, then
2421 // truncate to the right size.
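// E.g. (illustrative) an i16 value promoted to an i32 location with SExt loc
// info gets an AssertSext (asserting i16) followed by a truncate back to i16.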
2422 switch (VA.getLocInfo()) {
2423 case CCValAssign::Full:
2424 return Val;
2425 case CCValAssign::BCvt:
2426 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: ValVT, Operand: Val);
2427 case CCValAssign::SExt:
2428 Val = DAG.getNode(Opcode: ISD::AssertSext, DL: SL, VT: VA.getLocVT(), N1: Val,
2429 N2: DAG.getValueType(ValVT));
2430 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: ValVT, Operand: Val);
2431 case CCValAssign::ZExt:
2432 Val = DAG.getNode(Opcode: ISD::AssertZext, DL: SL, VT: VA.getLocVT(), N1: Val,
2433 N2: DAG.getValueType(ValVT));
2434 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: ValVT, Operand: Val);
2435 case CCValAssign::AExt:
2436 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: ValVT, Operand: Val);
2437 default:
2438 llvm_unreachable("Unknown loc info!");
2439 }
2440}
2441
2442SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2443 CCValAssign &VA, const SDLoc &SL,
2444 SDValue Chain,
2445 const ISD::InputArg &Arg) const {
2446 MachineFunction &MF = DAG.getMachineFunction();
2447 MachineFrameInfo &MFI = MF.getFrameInfo();
2448
2449 if (Arg.Flags.isByVal()) {
2450 unsigned Size = Arg.Flags.getByValSize();
2451 int FrameIdx = MFI.CreateFixedObject(Size, SPOffset: VA.getLocMemOffset(), IsImmutable: false);
2452 return DAG.getFrameIndex(FI: FrameIdx, VT: MVT::i32);
2453 }
2454
2455 unsigned ArgOffset = VA.getLocMemOffset();
2456 unsigned ArgSize = VA.getValVT().getStoreSize();
2457
2458 int FI = MFI.CreateFixedObject(Size: ArgSize, SPOffset: ArgOffset, IsImmutable: true);
2459
2460 // Create load nodes to retrieve arguments from the stack.
2461 SDValue FIN = DAG.getFrameIndex(FI, VT: MVT::i32);
2462
2463 // For NON_EXTLOAD, the generic code in getLoad asserts that ValVT == MemVT.
2464 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
2465 MVT MemVT = VA.getValVT();
2466
2467 switch (VA.getLocInfo()) {
2468 default:
2469 break;
2470 case CCValAssign::BCvt:
2471 MemVT = VA.getLocVT();
2472 break;
2473 case CCValAssign::SExt:
2474 ExtType = ISD::SEXTLOAD;
2475 break;
2476 case CCValAssign::ZExt:
2477 ExtType = ISD::ZEXTLOAD;
2478 break;
2479 case CCValAssign::AExt:
2480 ExtType = ISD::EXTLOAD;
2481 break;
2482 }
2483
2484 SDValue ArgValue = DAG.getExtLoad(
2485 ExtType, dl: SL, VT: VA.getLocVT(), Chain, Ptr: FIN,
2486 PtrInfo: MachinePointerInfo::getFixedStack(MF&: DAG.getMachineFunction(), FI), MemVT);
2487
2488 SDValue ConvertedVal = convertABITypeToValueType(DAG, Val: ArgValue, VA, SL);
2489 if (ConvertedVal == ArgValue)
2490 return ConvertedVal;
2491
2492 return DAG.getMergeValues(Ops: {ConvertedVal, ArgValue.getValue(R: 1)}, dl: SL);
2493}
2494
2495SDValue SITargetLowering::lowerWorkGroupId(
2496 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2497 AMDGPUFunctionArgInfo::PreloadedValue WorkGroupIdPV,
2498 AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV,
2499 AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
2500 if (!Subtarget->hasClusters())
2501 return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2502
2503 // Clusters are supported. Return the global position in the grid. If clusters
2504 // are enabled, WorkGroupIdPV returns the cluster ID, not the workgroup ID.
2505
2506 // WorkGroupIdXYZ = ClusterId == 0 ?
2507 // ClusterIdXYZ :
2508 // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
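// E.g. (illustrative) with 4 workgroups per cluster in X (ClusterMaxIdX == 3),
// cluster id 2 and cluster workgroup id 1 give workgroup id 2 * 4 + 1 == 9.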
2509 SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2510 SDLoc SL(ClusterIdXYZ);
2511 SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
2512 SDValue One = DAG.getConstant(Val: 1, DL: SL, VT);
2513 SDValue ClusterSizeXYZ = DAG.getNode(Opcode: ISD::ADD, DL: SL, VT, N1: ClusterMaxIdXYZ, N2: One);
2514 SDValue ClusterWorkGroupIdXYZ =
2515 getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
2516 SDValue GlobalIdXYZ =
2517 DAG.getNode(Opcode: ISD::ADD, DL: SL, VT, N1: ClusterWorkGroupIdXYZ,
2518 N2: DAG.getNode(Opcode: ISD::MUL, DL: SL, VT, N1: ClusterIdXYZ, N2: ClusterSizeXYZ));
2519
2520 switch (MFI.getClusterDims().getKind()) {
2521 case AMDGPU::ClusterDimsAttr::Kind::FixedDims:
2522 case AMDGPU::ClusterDimsAttr::Kind::VariableDims:
2523 return GlobalIdXYZ;
2524 case AMDGPU::ClusterDimsAttr::Kind::NoCluster:
2525 return ClusterIdXYZ;
2526 case AMDGPU::ClusterDimsAttr::Kind::Unknown: {
2527 using namespace AMDGPU::Hwreg;
2528 SDValue ClusterIdField =
2529 DAG.getTargetConstant(Val: HwregEncoding::encode(Values: ID_IB_STS2, Values: 6, Values: 4), DL: SL, VT);
2530 SDNode *GetReg =
2531 DAG.getMachineNode(Opcode: AMDGPU::S_GETREG_B32_const, dl: SL, VT, Op1: ClusterIdField);
2532 SDValue ClusterId(GetReg, 0);
2533 SDValue Zero = DAG.getConstant(Val: 0, DL: SL, VT);
2534 return DAG.getNode(Opcode: ISD::SELECT_CC, DL: SL, VT, N1: ClusterId, N2: Zero, N3: ClusterIdXYZ,
2535 N4: GlobalIdXYZ, N5: DAG.getCondCode(Cond: ISD::SETEQ));
2536 }
2537 }
2538
2539 llvm_unreachable("nothing should reach here");
2540}
2541
2542SDValue SITargetLowering::getPreloadedValue(
2543 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2544 AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
2545 const ArgDescriptor *Reg = nullptr;
2546 const TargetRegisterClass *RC;
2547 LLT Ty;
2548
2549 CallingConv::ID CC = DAG.getMachineFunction().getFunction().getCallingConv();
2550 const ArgDescriptor WorkGroupIDX =
2551 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP9);
2552 // If GridZ is not programmed in an entry function then the hardware will set
2553 // it to all zeros, so there is no need to mask the GridY value in the low
2554 // order bits.
2555 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2556 Reg: AMDGPU::TTMP7,
2557 Mask: AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2558 const ArgDescriptor WorkGroupIDZ =
2559 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP7, Mask: 0xFFFF0000u);
2560 const ArgDescriptor ClusterWorkGroupIDX =
2561 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x0000000Fu);
2562 const ArgDescriptor ClusterWorkGroupIDY =
2563 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x000000F0u);
2564 const ArgDescriptor ClusterWorkGroupIDZ =
2565 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x00000F00u);
2566 const ArgDescriptor ClusterWorkGroupMaxIDX =
2567 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x0000F000u);
2568 const ArgDescriptor ClusterWorkGroupMaxIDY =
2569 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x000F0000u);
2570 const ArgDescriptor ClusterWorkGroupMaxIDZ =
2571 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x00F00000u);
2572 const ArgDescriptor ClusterWorkGroupMaxFlatID =
2573 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x0F000000u);
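// I.e. TTMP6 packs these as 4-bit fields: cluster workgroup ID X/Y/Z in bits
// [3:0]/[7:4]/[11:8], the corresponding max IDs in [15:12]/[19:16]/[23:20],
// and the max flat ID in [27:24].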
2574
2575 auto LoadConstant = [&](unsigned N) {
2576 return DAG.getConstant(Val: N, DL: SDLoc(), VT);
2577 };
2578
2579 if (Subtarget->hasArchitectedSGPRs() &&
2580 (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
2581 AMDGPU::ClusterDimsAttr ClusterDims = MFI.getClusterDims();
2582 bool HasFixedDims = ClusterDims.isFixedDims();
2583
2584 switch (PVID) {
2585 case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
2586 Reg = &WorkGroupIDX;
2587 RC = &AMDGPU::SReg_32RegClass;
2588 Ty = LLT::scalar(SizeInBits: 32);
2589 break;
2590 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
2591 Reg = &WorkGroupIDY;
2592 RC = &AMDGPU::SReg_32RegClass;
2593 Ty = LLT::scalar(SizeInBits: 32);
2594 break;
2595 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
2596 Reg = &WorkGroupIDZ;
2597 RC = &AMDGPU::SReg_32RegClass;
2598 Ty = LLT::scalar(SizeInBits: 32);
2599 break;
2600 case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X:
2601 if (HasFixedDims && ClusterDims.getDims()[0] == 1)
2602 return LoadConstant(0);
2603 Reg = &ClusterWorkGroupIDX;
2604 RC = &AMDGPU::SReg_32RegClass;
2605 Ty = LLT::scalar(SizeInBits: 32);
2606 break;
2607 case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y:
2608 if (HasFixedDims && ClusterDims.getDims()[1] == 1)
2609 return LoadConstant(0);
2610 Reg = &ClusterWorkGroupIDY;
2611 RC = &AMDGPU::SReg_32RegClass;
2612 Ty = LLT::scalar(SizeInBits: 32);
2613 break;
2614 case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z:
2615 if (HasFixedDims && ClusterDims.getDims()[2] == 1)
2616 return LoadConstant(0);
2617 Reg = &ClusterWorkGroupIDZ;
2618 RC = &AMDGPU::SReg_32RegClass;
2619 Ty = LLT::scalar(SizeInBits: 32);
2620 break;
2621 case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X:
2622 if (HasFixedDims)
2623 return LoadConstant(ClusterDims.getDims()[0] - 1);
2624 Reg = &ClusterWorkGroupMaxIDX;
2625 RC = &AMDGPU::SReg_32RegClass;
2626 Ty = LLT::scalar(SizeInBits: 32);
2627 break;
2628 case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y:
2629 if (HasFixedDims)
2630 return LoadConstant(ClusterDims.getDims()[1] - 1);
2631 Reg = &ClusterWorkGroupMaxIDY;
2632 RC = &AMDGPU::SReg_32RegClass;
2633 Ty = LLT::scalar(SizeInBits: 32);
2634 break;
2635 case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z:
2636 if (HasFixedDims)
2637 return LoadConstant(ClusterDims.getDims()[2] - 1);
2638 Reg = &ClusterWorkGroupMaxIDZ;
2639 RC = &AMDGPU::SReg_32RegClass;
2640 Ty = LLT::scalar(SizeInBits: 32);
2641 break;
2642 case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID:
2643 Reg = &ClusterWorkGroupMaxFlatID;
2644 RC = &AMDGPU::SReg_32RegClass;
2645 Ty = LLT::scalar(SizeInBits: 32);
2646 break;
2647 default:
2648 break;
2649 }
2650 }
2651
2652 if (!Reg)
2653 std::tie(args&: Reg, args&: RC, args&: Ty) = MFI.getPreloadedValue(Value: PVID);
2654 if (!Reg) {
2655 if (PVID == AMDGPUFunctionArgInfo::PreloadedValue::KERNARG_SEGMENT_PTR) {
2656 // It's possible for a kernarg intrinsic call to appear in a kernel with
2657 // no allocated segment, in which case we do not add the user sgpr
2658 // argument, so just return null.
2659 return DAG.getConstant(Val: 0, DL: SDLoc(), VT);
2660 }
2661
2662 // It's undefined behavior if a function marked with the amdgpu-no-*
2663 // attributes uses the corresponding intrinsic.
2664 return DAG.getPOISON(VT);
2665 }
2666
2667 return loadInputValue(DAG, RC, VT, SL: SDLoc(DAG.getEntryNode()), Arg: *Reg);
2668}
2669
2670static void processPSInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
2671 CallingConv::ID CallConv,
2672 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2673 FunctionType *FType,
2674 SIMachineFunctionInfo *Info) {
2675 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2676 const ISD::InputArg *Arg = &Ins[I];
2677
2678 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2679 "vector type argument should have been split");
2680
2681 // First check if it's a PS input addr.
2682 if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2683 PSInputNum <= 15) {
2684 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(Index: PSInputNum);
2685
2686 // Inconveniently only the first part of the split is marked as isSplit,
2687 // so skip to the end. We only want to increment PSInputNum once for the
2688 // entire split argument.
2689 if (Arg->Flags.isSplit()) {
2690 while (!Arg->Flags.isSplitEnd()) {
2691 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2692 "unexpected vector split in ps argument type");
2693 if (!SkipArg)
2694 Splits.push_back(Elt: *Arg);
2695 Arg = &Ins[++I];
2696 }
2697 }
2698
2699 if (SkipArg) {
2700 // We can safely skip PS inputs.
2701 Skipped.set(Arg->getOrigArgIndex());
2702 ++PSInputNum;
2703 continue;
2704 }
2705
2706 Info->markPSInputAllocated(Index: PSInputNum);
2707 if (Arg->Used)
2708 Info->markPSInputEnabled(Index: PSInputNum);
2709
2710 ++PSInputNum;
2711 }
2712
2713 Splits.push_back(Elt: *Arg);
2714 }
2715}
2716
2717// Allocate special inputs passed in VGPRs.
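// With packed TIDs all three workitem IDs share VGPR0:
//   bits [9:0]   - workitem ID X
//   bits [19:10] - workitem ID Y
//   bits [29:20] - workitem ID Z
// Otherwise X, Y and Z arrive in VGPR0, VGPR1 and VGPR2 respectively.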
2718void SITargetLowering::allocateSpecialEntryInputVGPRs(
2719 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2720 SIMachineFunctionInfo &Info) const {
2721 const LLT S32 = LLT::scalar(SizeInBits: 32);
2722 MachineRegisterInfo &MRI = MF.getRegInfo();
2723
2724 if (Info.hasWorkItemIDX()) {
2725 Register Reg = AMDGPU::VGPR0;
2726 MRI.setType(VReg: MF.addLiveIn(PReg: Reg, RC: &AMDGPU::VGPR_32RegClass), Ty: S32);
2727
2728 CCInfo.AllocateReg(Reg);
2729 unsigned Mask =
2730 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2731 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2732 }
2733
2734 if (Info.hasWorkItemIDY()) {
2735 assert(Info.hasWorkItemIDX());
2736 if (Subtarget->hasPackedTID()) {
2737 Info.setWorkItemIDY(
2738 ArgDescriptor::createRegister(Reg: AMDGPU::VGPR0, Mask: 0x3ff << 10));
2739 } else {
2740 unsigned Reg = AMDGPU::VGPR1;
2741 MRI.setType(VReg: MF.addLiveIn(PReg: Reg, RC: &AMDGPU::VGPR_32RegClass), Ty: S32);
2742
2743 CCInfo.AllocateReg(Reg);
2744 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2745 }
2746 }
2747
2748 if (Info.hasWorkItemIDZ()) {
2749 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2750 if (Subtarget->hasPackedTID()) {
2751 Info.setWorkItemIDZ(
2752 ArgDescriptor::createRegister(Reg: AMDGPU::VGPR0, Mask: 0x3ff << 20));
2753 } else {
2754 unsigned Reg = AMDGPU::VGPR2;
2755 MRI.setType(VReg: MF.addLiveIn(PReg: Reg, RC: &AMDGPU::VGPR_32RegClass), Ty: S32);
2756
2757 CCInfo.AllocateReg(Reg);
2758 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2759 }
2760 }
2761}
2762
2763// Try to allocate a VGPR at the end of the argument list, or if no argument
2764 // VGPRs are left, allocate a stack slot instead.
2765 // If \p Mask is given, it indicates the bitfield position in the register.
2766 // If \p Arg is given, reuse it with the new \p Mask instead of allocating anew.
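// For example, the packed workitem ID Y field is requested further below as
//   Arg = allocateVGPR32Input(CCInfo, 0x3ff << 10, Arg);
// which reuses the VGPR already holding the X field when one was allocated.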
2767static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2768 ArgDescriptor Arg = ArgDescriptor()) {
2769 if (Arg.isSet())
2770 return ArgDescriptor::createArg(Arg, Mask);
2771
2772 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2773 unsigned RegIdx = CCInfo.getFirstUnallocated(Regs: ArgVGPRs);
2774 if (RegIdx == ArgVGPRs.size()) {
2775 // Spill to stack required.
2776 int64_t Offset = CCInfo.AllocateStack(Size: 4, Alignment: Align(4));
2777
2778 return ArgDescriptor::createStack(Offset, Mask);
2779 }
2780
2781 unsigned Reg = ArgVGPRs[RegIdx];
2782 Reg = CCInfo.AllocateReg(Reg);
2783 assert(Reg != AMDGPU::NoRegister);
2784
2785 MachineFunction &MF = CCInfo.getMachineFunction();
2786 Register LiveInVReg = MF.addLiveIn(PReg: Reg, RC: &AMDGPU::VGPR_32RegClass);
2787 MF.getRegInfo().setType(VReg: LiveInVReg, Ty: LLT::scalar(SizeInBits: 32));
2788 return ArgDescriptor::createRegister(Reg, Mask);
2789}
2790
2791static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
2792 const TargetRegisterClass *RC,
2793 unsigned NumArgRegs) {
2794 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2795 unsigned RegIdx = CCInfo.getFirstUnallocated(Regs: ArgSGPRs);
2796 if (RegIdx == ArgSGPRs.size())
2797 report_fatal_error(reason: "ran out of SGPRs for arguments");
2798
2799 unsigned Reg = ArgSGPRs[RegIdx];
2800 Reg = CCInfo.AllocateReg(Reg);
2801 assert(Reg != AMDGPU::NoRegister);
2802
2803 MachineFunction &MF = CCInfo.getMachineFunction();
2804 MF.addLiveIn(PReg: Reg, RC);
2805 return ArgDescriptor::createRegister(Reg);
2806}
2807
2808// If this has a fixed position, we still should allocate the register in the
2809 // CCInfo state. Technically we could get away without this for values passed
2810// outside of the normal argument range.
2811static void allocateFixedSGPRInputImpl(CCState &CCInfo,
2812 const TargetRegisterClass *RC,
2813 MCRegister Reg) {
2814 Reg = CCInfo.AllocateReg(Reg);
2815 assert(Reg != AMDGPU::NoRegister);
2816 MachineFunction &MF = CCInfo.getMachineFunction();
2817 MF.addLiveIn(PReg: Reg, RC);
2818}
2819
2820static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2821 if (Arg) {
2822 allocateFixedSGPRInputImpl(CCInfo, RC: &AMDGPU::SGPR_32RegClass,
2823 Reg: Arg.getRegister());
2824 } else
2825 Arg = allocateSGPR32InputImpl(CCInfo, RC: &AMDGPU::SGPR_32RegClass, NumArgRegs: 32);
2826}
2827
2828static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2829 if (Arg) {
2830 allocateFixedSGPRInputImpl(CCInfo, RC: &AMDGPU::SGPR_64RegClass,
2831 Reg: Arg.getRegister());
2832 } else
2833 Arg = allocateSGPR32InputImpl(CCInfo, RC: &AMDGPU::SGPR_64RegClass, NumArgRegs: 16);
2834}
2835
2836/// Allocate implicit function VGPR arguments at the end of allocated user
2837/// arguments.
2838void SITargetLowering::allocateSpecialInputVGPRs(
2839 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2840 SIMachineFunctionInfo &Info) const {
2841 const unsigned Mask = 0x3ff;
2842 ArgDescriptor Arg;
2843
2844 if (Info.hasWorkItemIDX()) {
2845 Arg = allocateVGPR32Input(CCInfo, Mask);
2846 Info.setWorkItemIDX(Arg);
2847 }
2848
2849 if (Info.hasWorkItemIDY()) {
2850 Arg = allocateVGPR32Input(CCInfo, Mask: Mask << 10, Arg);
2851 Info.setWorkItemIDY(Arg);
2852 }
2853
2854 if (Info.hasWorkItemIDZ())
2855 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask: Mask << 20, Arg));
2856}
2857
2858/// Allocate implicit function VGPR arguments in fixed registers.
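/// All three workitem IDs are packed into VGPR31, using the same 10-bit
/// fields as the entry-point layout.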
2859void SITargetLowering::allocateSpecialInputVGPRsFixed(
2860 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2861 SIMachineFunctionInfo &Info) const {
2862 Register Reg = CCInfo.AllocateReg(Reg: AMDGPU::VGPR31);
2863 if (!Reg)
2864 report_fatal_error(reason: "failed to allocate VGPR for implicit arguments");
2865
2866 const unsigned Mask = 0x3ff;
2867 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2868 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask: Mask << 10));
2869 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask: Mask << 20));
2870}
2871
2872void SITargetLowering::allocateSpecialInputSGPRs(
2873 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2874 SIMachineFunctionInfo &Info) const {
2875 auto &ArgInfo = Info.getArgInfo();
2876 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2877
2878 // TODO: Unify handling with private memory pointers.
2879 if (UserSGPRInfo.hasDispatchPtr())
2880 allocateSGPR64Input(CCInfo, Arg&: ArgInfo.DispatchPtr);
2881
2882 if (UserSGPRInfo.hasQueuePtr())
2883 allocateSGPR64Input(CCInfo, Arg&: ArgInfo.QueuePtr);
2884
2885 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2886 // constant offset from the kernarg segment.
2887 if (Info.hasImplicitArgPtr())
2888 allocateSGPR64Input(CCInfo, Arg&: ArgInfo.ImplicitArgPtr);
2889
2890 if (UserSGPRInfo.hasDispatchID())
2891 allocateSGPR64Input(CCInfo, Arg&: ArgInfo.DispatchID);
2892
2893 // flat_scratch_init is not applicable for non-kernel functions.
2894
2895 if (Info.hasWorkGroupIDX())
2896 allocateSGPR32Input(CCInfo, Arg&: ArgInfo.WorkGroupIDX);
2897
2898 if (Info.hasWorkGroupIDY())
2899 allocateSGPR32Input(CCInfo, Arg&: ArgInfo.WorkGroupIDY);
2900
2901 if (Info.hasWorkGroupIDZ())
2902 allocateSGPR32Input(CCInfo, Arg&: ArgInfo.WorkGroupIDZ);
2903
2904 if (Info.hasLDSKernelId())
2905 allocateSGPR32Input(CCInfo, Arg&: ArgInfo.LDSKernelId);
2906}
2907
2908// Allocate special inputs passed in user SGPRs.
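// Registers are claimed in order: implicit buffer pointer (non-HSA only),
// private segment buffer, dispatch pointer, queue pointer, kernarg segment
// pointer, dispatch ID, flat scratch init and private segment size.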
2909void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
2910 MachineFunction &MF,
2911 const SIRegisterInfo &TRI,
2912 SIMachineFunctionInfo &Info) const {
2913 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2914 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2915 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2916 MF.addLiveIn(PReg: ImplicitBufferPtrReg, RC: &AMDGPU::SGPR_64RegClass);
2917 CCInfo.AllocateReg(Reg: ImplicitBufferPtrReg);
2918 }
2919
2920 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2921 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2922 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2923 MF.addLiveIn(PReg: PrivateSegmentBufferReg, RC: &AMDGPU::SGPR_128RegClass);
2924 CCInfo.AllocateReg(Reg: PrivateSegmentBufferReg);
2925 }
2926
2927 if (UserSGPRInfo.hasDispatchPtr()) {
2928 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2929 MF.addLiveIn(PReg: DispatchPtrReg, RC: &AMDGPU::SGPR_64RegClass);
2930 CCInfo.AllocateReg(Reg: DispatchPtrReg);
2931 }
2932
2933 if (UserSGPRInfo.hasQueuePtr()) {
2934 Register QueuePtrReg = Info.addQueuePtr(TRI);
2935 MF.addLiveIn(PReg: QueuePtrReg, RC: &AMDGPU::SGPR_64RegClass);
2936 CCInfo.AllocateReg(Reg: QueuePtrReg);
2937 }
2938
2939 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2940 MachineRegisterInfo &MRI = MF.getRegInfo();
2941 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2942 CCInfo.AllocateReg(Reg: InputPtrReg);
2943
2944 Register VReg = MF.addLiveIn(PReg: InputPtrReg, RC: &AMDGPU::SGPR_64RegClass);
2945 MRI.setType(VReg, Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
2946 }
2947
2948 if (UserSGPRInfo.hasDispatchID()) {
2949 Register DispatchIDReg = Info.addDispatchID(TRI);
2950 MF.addLiveIn(PReg: DispatchIDReg, RC: &AMDGPU::SGPR_64RegClass);
2951 CCInfo.AllocateReg(Reg: DispatchIDReg);
2952 }
2953
2954 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2955 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2956 MF.addLiveIn(PReg: FlatScratchInitReg, RC: &AMDGPU::SGPR_64RegClass);
2957 CCInfo.AllocateReg(Reg: FlatScratchInitReg);
2958 }
2959
2960 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2961 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2962 MF.addLiveIn(PReg: PrivateSegmentSizeReg, RC: &AMDGPU::SGPR_32RegClass);
2963 CCInfo.AllocateReg(Reg: PrivateSegmentSizeReg);
2964 }
2965
2966 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2967 // these from the dispatch pointer.
2968}
2969
2970 // Allocate preloaded kernel arguments. Arguments to be preloaded must be
2971 // sequential, starting from the first argument.
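//
// SGPR accounting, illustrated on a hypothetical (i32, i64) signature:
//   - the i32 at offset 0 occupies one SGPR; LastExplicitArgOffset becomes 4.
//   - the i64 is placed at offset 8, so the 4-byte gap costs one padding SGPR
//     and the value itself needs two more.
// Preloading stops once the required SGPRs exceed the free user SGPRs.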
2972void SITargetLowering::allocatePreloadKernArgSGPRs(
2973 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2974 const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF,
2975 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2976 Function &F = MF.getFunction();
2977 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
2978 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2979 bool InPreloadSequence = true;
2980 unsigned InIdx = 0;
2981 bool AlignedForImplictArgs = false;
2982 unsigned ImplicitArgOffset = 0;
2983 for (auto &Arg : F.args()) {
2984 if (!InPreloadSequence || !Arg.hasInRegAttr())
2985 break;
2986
2987 unsigned ArgIdx = Arg.getArgNo();
2988 // Don't preload non-original args or parts not in the current preload
2989 // sequence.
2990 if (InIdx < Ins.size() &&
2991 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
2992 break;
2993
2994 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2995 Ins[InIdx].getOrigArgIndex() == ArgIdx;
2996 InIdx++) {
2997 assert(ArgLocs[ArgIdx].isMemLoc());
2998 auto &ArgLoc = ArgLocs[InIdx];
2999 const Align KernelArgBaseAlign = Align(16);
3000 unsigned ArgOffset = ArgLoc.getLocMemOffset();
3001 Align Alignment = commonAlignment(A: KernelArgBaseAlign, Offset: ArgOffset);
3002 unsigned NumAllocSGPRs =
3003 alignTo(Value: ArgLoc.getLocVT().getFixedSizeInBits(), Align: 32) / 32;
3004
3005 // Fix alignment for hidden arguments.
3006 if (Arg.hasAttribute(Kind: "amdgpu-hidden-argument")) {
3007 if (!AlignedForImplictArgs) {
3008 ImplicitArgOffset =
3009 alignTo(Size: LastExplicitArgOffset,
3010 A: Subtarget->getAlignmentForImplicitArgPtr()) -
3011 LastExplicitArgOffset;
3012 AlignedForImplictArgs = true;
3013 }
3014 ArgOffset += ImplicitArgOffset;
3015 }
3016
3017 // Arg is preloaded into the previous SGPR.
3018 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
3019 assert(InIdx >= 1 && "No previous SGPR");
3020 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
3021 Elt: Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
3022 continue;
3023 }
3024
3025 unsigned Padding = ArgOffset - LastExplicitArgOffset;
3026 unsigned PaddingSGPRs = alignTo(Value: Padding, Align: 4) / 4;
3027 // Check for free user SGPRs for preloading.
3028 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
3029 InPreloadSequence = false;
3030 break;
3031 }
3032
3033 // Preload this argument.
3034 const TargetRegisterClass *RC =
3035 TRI.getSGPRClassForBitWidth(BitWidth: NumAllocSGPRs * 32);
3036 SmallVectorImpl<MCRegister> *PreloadRegs =
3037 Info.addPreloadedKernArg(TRI, RC, AllocSizeDWord: NumAllocSGPRs, KernArgIdx: InIdx, PaddingSGPRs);
3038
3039 if (PreloadRegs->size() > 1)
3040 RC = &AMDGPU::SGPR_32RegClass;
3041 for (auto &Reg : *PreloadRegs) {
3042 assert(Reg);
3043 MF.addLiveIn(PReg: Reg, RC);
3044 CCInfo.AllocateReg(Reg);
3045 }
3046
3047 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
3048 }
3049 }
3050}
3051
3052void SITargetLowering::allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF,
3053 const SIRegisterInfo &TRI,
3054 SIMachineFunctionInfo &Info) const {
3055 // Always allocate this last since it is a synthetic preload.
3056 if (Info.hasLDSKernelId()) {
3057 Register Reg = Info.addLDSKernelId();
3058 MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
3059 CCInfo.AllocateReg(Reg);
3060 }
3061}
3062
3063// Allocate special input registers that are initialized per-wave.
3064void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF,
3065 SIMachineFunctionInfo &Info,
3066 CallingConv::ID CallConv,
3067 bool IsShader) const {
3068 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
3069 if (Subtarget->hasUserSGPRInit16BugInWave32() && !IsShader) {
3070 // Note: user SGPRs are handled by the front-end for graphics shaders
3071 // Pad up the used user SGPRs with dead inputs.
3072
3073 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
3074 // before enabling architected SGPRs for workgroup IDs.
3075 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
3076
3077 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
3078 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
3079 // rely on it to reach 16 since if we end up having no stack usage, it will
3080 // not really be added.
3081 unsigned NumRequiredSystemSGPRs =
3082 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
3083 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
3084 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
3085 Register Reg = Info.addReservedUserSGPR();
3086 MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
3087 CCInfo.AllocateReg(Reg);
3088 }
3089 }
3090
3091 if (!HasArchitectedSGPRs) {
3092 if (Info.hasWorkGroupIDX()) {
3093 Register Reg = Info.addWorkGroupIDX();
3094 MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
3095 CCInfo.AllocateReg(Reg);
3096 }
3097
3098 if (Info.hasWorkGroupIDY()) {
3099 Register Reg = Info.addWorkGroupIDY();
3100 MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
3101 CCInfo.AllocateReg(Reg);
3102 }
3103
3104 if (Info.hasWorkGroupIDZ()) {
3105 Register Reg = Info.addWorkGroupIDZ();
3106 MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
3107 CCInfo.AllocateReg(Reg);
3108 }
3109 }
3110
3111 if (Info.hasWorkGroupInfo()) {
3112 Register Reg = Info.addWorkGroupInfo();
3113 MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
3114 CCInfo.AllocateReg(Reg);
3115 }
3116
3117 if (Info.hasPrivateSegmentWaveByteOffset()) {
3118 // Scratch wave offset passed in system SGPR.
3119 unsigned PrivateSegmentWaveByteOffsetReg;
3120
3121 if (IsShader) {
3122 PrivateSegmentWaveByteOffsetReg =
3123 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
3124
3125 // This is true if the scratch wave byte offset doesn't have a fixed
3126 // location.
3127 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
3128 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
3129 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
3130 }
3131 } else
3132 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
3133
3134 MF.addLiveIn(PReg: PrivateSegmentWaveByteOffsetReg, RC: &AMDGPU::SGPR_32RegClass);
3135 CCInfo.AllocateReg(Reg: PrivateSegmentWaveByteOffsetReg);
3136 }
3137
3138 assert(!Subtarget->hasUserSGPRInit16BugInWave32() || IsShader ||
3139 Info.getNumPreloadedSGPRs() >= 16);
3140}
3141
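// Decide which physical registers back the scratch resource descriptor, the
// stack pointer and the frame pointer now that the preloaded inputs are known.
// SGPR32 is preferred for the SP (moved only if it is already a live-in shader
// argument) and SGPR33 is used for the FP when one is needed.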
3142static void reservePrivateMemoryRegs(const TargetMachine &TM,
3143 MachineFunction &MF,
3144 const SIRegisterInfo &TRI,
3145 SIMachineFunctionInfo &Info) {
3146 // Now that we've figured out where the scratch register inputs are, see if
3147   // we should reserve the arguments and use them directly.
3148 MachineFrameInfo &MFI = MF.getFrameInfo();
3149 bool HasStackObjects = MFI.hasStackObjects();
3150 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
3151
3152 // Record that we know we have non-spill stack objects so we don't need to
3153 // check all stack objects later.
3154 if (HasStackObjects)
3155 Info.setHasNonSpillStackObjects(true);
3156
3157 // Everything live out of a block is spilled with fast regalloc, so it's
3158 // almost certain that spilling will be required.
3159 if (TM.getOptLevel() == CodeGenOptLevel::None)
3160 HasStackObjects = true;
3161
3162 // For now assume stack access is needed in any callee functions, so we need
3163 // the scratch registers to pass in.
3164 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
3165
3166 if (!ST.hasFlatScratchEnabled()) {
3167 if (RequiresStackAccess && ST.isAmdHsaOrMesa(F: MF.getFunction())) {
3168 // If we have stack objects, we unquestionably need the private buffer
3169 // resource. For the Code Object V2 ABI, this will be the first 4 user
3170 // SGPR inputs. We can reserve those and use them directly.
3171
3172 Register PrivateSegmentBufferReg =
3173 Info.getPreloadedReg(Value: AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
3174 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
3175 } else {
3176 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
3177       // We tentatively reserve the last available registers (skipping the final
3178       // few, which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
3179 // we'll replace these with the ones immediately after those which were
3180 // really allocated. In the prologue copies will be inserted from the
3181 // argument to these reserved registers.
3182
3183 // Without HSA, relocations are used for the scratch pointer and the
3184 // buffer resource setup is always inserted in the prologue. Scratch wave
3185 // offset is still in an input SGPR.
3186 Info.setScratchRSrcReg(ReservedBufferReg);
3187 }
3188 }
3189
3190 MachineRegisterInfo &MRI = MF.getRegInfo();
3191
3192 // For entry functions we have to set up the stack pointer if we use it,
3193 // whereas non-entry functions get this "for free". This means there is no
3194 // intrinsic advantage to using S32 over S34 in cases where we do not have
3195 // calls but do need a frame pointer (i.e. if we are requested to have one
3196 // because frame pointer elimination is disabled). To keep things simple we
3197 // only ever use S32 as the call ABI stack pointer, and so using it does not
3198 // imply we need a separate frame pointer.
3199 //
3200 // Try to use s32 as the SP, but move it if it would interfere with input
3201 // arguments. This won't work with calls though.
3202 //
3203 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
3204 // registers.
3205 if (!MRI.isLiveIn(Reg: AMDGPU::SGPR32)) {
3206 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
3207 } else {
3208 assert(AMDGPU::isShader(MF.getFunction().getCallingConv()));
3209
3210 if (MFI.hasCalls())
3211 report_fatal_error(reason: "call in graphics shader with too many input SGPRs");
3212
3213 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
3214 if (!MRI.isLiveIn(Reg)) {
3215 Info.setStackPtrOffsetReg(Reg);
3216 break;
3217 }
3218 }
3219
3220 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
3221 report_fatal_error(reason: "failed to find register for SP");
3222 }
3223
3224 // hasFP should be accurate for entry functions even before the frame is
3225 // finalized, because it does not rely on the known stack size, only
3226 // properties like whether variable sized objects are present.
3227 if (ST.getFrameLowering()->hasFP(MF)) {
3228 Info.setFrameOffsetReg(AMDGPU::SGPR33);
3229 }
3230}
3231
3232bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
3233 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3234 return !Info->isEntryFunction();
3235}
3236
3237void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {}
3238
3239void SITargetLowering::insertCopiesSplitCSR(
3240 MachineBasicBlock *Entry,
3241 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
3242 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
3243
3244 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(MF: Entry->getParent());
3245 if (!IStart)
3246 return;
3247
3248 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3249 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
3250 MachineBasicBlock::iterator MBBI = Entry->begin();
3251 for (const MCPhysReg *I = IStart; *I; ++I) {
3252 const TargetRegisterClass *RC = nullptr;
3253 if (AMDGPU::SReg_64RegClass.contains(Reg: *I))
3254 RC = &AMDGPU::SGPR_64RegClass;
3255 else if (AMDGPU::SReg_32RegClass.contains(Reg: *I))
3256 RC = &AMDGPU::SGPR_32RegClass;
3257 else
3258 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3259
3260 Register NewVR = MRI->createVirtualRegister(RegClass: RC);
3261 // Create copy from CSR to a virtual register.
3262 Entry->addLiveIn(PhysReg: *I);
3263 BuildMI(BB&: *Entry, I: MBBI, MIMD: DebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: NewVR)
3264 .addReg(RegNo: *I);
3265
3266 // Insert the copy-back instructions right before the terminator.
3267 for (auto *Exit : Exits)
3268 BuildMI(BB&: *Exit, I: Exit->getFirstTerminator(), MIMD: DebugLoc(),
3269 MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: *I)
3270 .addReg(RegNo: NewVR);
3271 }
3272}
3273
3274SDValue SITargetLowering::LowerFormalArguments(
3275 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3276 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3277 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3278 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
3279
3280 MachineFunction &MF = DAG.getMachineFunction();
3281 const Function &Fn = MF.getFunction();
3282 FunctionType *FType = MF.getFunction().getFunctionType();
3283 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3284 bool IsError = false;
3285
3286 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CC: CallConv)) {
3287 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
3288 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));
3289 IsError = true;
3290 }
3291
3292 SmallVector<ISD::InputArg, 16> Splits;
3293 SmallVector<CCValAssign, 16> ArgLocs;
3294 BitVector Skipped(Ins.size());
3295 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3296 *DAG.getContext());
3297
3298 bool IsGraphics = AMDGPU::isGraphics(CC: CallConv);
3299 bool IsKernel = AMDGPU::isKernel(CC: CallConv);
3300 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC: CallConv);
3301
3302 if (IsGraphics) {
3303 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
3304 assert(!UserSGPRInfo.hasDispatchPtr() &&
3305 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
3306 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
3307 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
3308 (void)UserSGPRInfo;
3309 if (!Subtarget->hasFlatScratchEnabled())
3310 assert(!UserSGPRInfo.hasFlatScratchInit());
3311 if ((CallConv != CallingConv::AMDGPU_CS &&
3312 CallConv != CallingConv::AMDGPU_Gfx &&
3313 CallConv != CallingConv::AMDGPU_Gfx_WholeWave) ||
3314 !Subtarget->hasArchitectedSGPRs())
3315 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
3316 !Info->hasWorkGroupIDZ());
3317 }
3318
3319 bool IsWholeWaveFunc = Info->isWholeWaveFunction();
3320
3321 if (CallConv == CallingConv::AMDGPU_PS) {
3322 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
3323
3324 // At least one interpolation mode must be enabled or else the GPU will
3325 // hang.
3326 //
3327 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
3328 // set PSInputAddr, the user wants to enable some bits after the compilation
3329 // based on run-time states. Since we can't know what the final PSInputEna
3330 // will look like, so we shouldn't do anything here and the user should take
3331     // will look like, we shouldn't do anything here and the user should take
3332 //
3333 // Otherwise, the following restrictions apply:
3334 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
3335 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
3336 // enabled too.
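    //
    // (0x7F covers both interpolation groups: PERSP_* in bits 0-3 and
    // LINEAR_* in bits 4-6.)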
3337 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
3338 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(Index: 11))) {
3339 CCInfo.AllocateReg(Reg: AMDGPU::VGPR0);
3340 CCInfo.AllocateReg(Reg: AMDGPU::VGPR1);
3341 Info->markPSInputAllocated(Index: 0);
3342 Info->markPSInputEnabled(Index: 0);
3343 }
3344 if (Subtarget->isAmdPalOS()) {
3345 // For isAmdPalOS, the user does not enable some bits after compilation
3346 // based on run-time states; the register values being generated here are
3347 // the final ones set in hardware. Therefore we need to apply the
3348 // workaround to PSInputAddr and PSInputEnable together. (The case where
3349 // a bit is set in PSInputAddr but not PSInputEnable is where the
3350 // frontend set up an input arg for a particular interpolation mode, but
3351 // nothing uses that input arg. Really we should have an earlier pass
3352 // that removes such an arg.)
3353 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
3354 if ((PsInputBits & 0x7F) == 0 ||
3355 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
3356 Info->markPSInputEnabled(Index: llvm::countr_zero(Val: Info->getPSInputAddr()));
3357 }
3358 } else if (IsKernel) {
3359 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
3360 } else {
3361 Splits.append(in_start: IsWholeWaveFunc ? std::next(x: Ins.begin()) : Ins.begin(),
3362 in_end: Ins.end());
3363 }
3364
3365 if (IsKernel)
3366 analyzeFormalArgumentsCompute(State&: CCInfo, Ins);
3367
3368 if (IsEntryFunc) {
3369 allocateSpecialEntryInputVGPRs(CCInfo, MF, TRI: *TRI, Info&: *Info);
3370 allocateHSAUserSGPRs(CCInfo, MF, TRI: *TRI, Info&: *Info);
3371 if (IsKernel && Subtarget->hasKernargPreload())
3372 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, TRI: *TRI, Info&: *Info);
3373
3374 allocateLDSKernelId(CCInfo, MF, TRI: *TRI, Info&: *Info);
3375 } else if (!IsGraphics) {
3376 // For the fixed ABI, pass workitem IDs in the last argument register.
3377 allocateSpecialInputVGPRsFixed(CCInfo, MF, TRI: *TRI, Info&: *Info);
3378
3379 // FIXME: Sink this into allocateSpecialInputSGPRs
3380 if (!Subtarget->hasFlatScratchEnabled())
3381 CCInfo.AllocateReg(Reg: Info->getScratchRSrcReg());
3382
3383 allocateSpecialInputSGPRs(CCInfo, MF, TRI: *TRI, Info&: *Info);
3384 }
3385
3386 if (!IsKernel) {
3387 CCAssignFn *AssignFn = CCAssignFnForCall(CC: CallConv, IsVarArg: isVarArg);
3388 CCInfo.AnalyzeFormalArguments(Ins: Splits, Fn: AssignFn);
3389
3390 // This assumes the registers are allocated by CCInfo in ascending order
3391 // with no gaps.
3392 Info->setNumWaveDispatchSGPRs(
3393 CCInfo.getFirstUnallocated(Regs: AMDGPU::SGPR_32RegClass.getRegisters()));
3394 Info->setNumWaveDispatchVGPRs(
3395 CCInfo.getFirstUnallocated(Regs: AMDGPU::VGPR_32RegClass.getRegisters()));
3396 } else if (Info->getNumKernargPreloadedSGPRs()) {
3397 Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
3398 }
3399
3400 SmallVector<SDValue, 16> Chains;
3401
3402 if (IsWholeWaveFunc) {
3403 SDValue Setup = DAG.getNode(Opcode: AMDGPUISD::WHOLE_WAVE_SETUP, DL,
3404 ResultTys: {MVT::i1, MVT::Other}, Ops: Chain);
3405 InVals.push_back(Elt: Setup.getValue(R: 0));
3406 Chains.push_back(Elt: Setup.getValue(R: 1));
3407 }
3408
3409 // FIXME: This is the minimum kernel argument alignment. We should improve
3410 // this to the maximum alignment of the arguments.
3411 //
3412 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
3413 // kern arg offset.
3414 const Align KernelArgBaseAlign = Align(16);
3415
3416 for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
3417 ++i) {
3418 const ISD::InputArg &Arg = Ins[i];
3419 if ((Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) || IsError) {
3420 InVals.push_back(Elt: DAG.getPOISON(VT: Arg.VT));
3421 continue;
3422 }
3423
3424 CCValAssign &VA = ArgLocs[ArgIdx++];
3425 MVT VT = VA.getLocVT();
3426
3427 if (IsEntryFunc && VA.isMemLoc()) {
3428 VT = Ins[i].VT;
3429 EVT MemVT = VA.getLocVT();
3430
3431 const uint64_t Offset = VA.getLocMemOffset();
3432 Align Alignment = commonAlignment(A: KernelArgBaseAlign, Offset);
3433
3434 if (Arg.Flags.isByRef()) {
3435 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL: DL, Chain, Offset);
3436
3437 const GCNTargetMachine &TM =
3438 static_cast<const GCNTargetMachine &>(getTargetMachine());
3439 if (!TM.isNoopAddrSpaceCast(SrcAS: AMDGPUAS::CONSTANT_ADDRESS,
3440 DestAS: Arg.Flags.getPointerAddrSpace())) {
3441 Ptr = DAG.getAddrSpaceCast(dl: DL, VT, Ptr, SrcAS: AMDGPUAS::CONSTANT_ADDRESS,
3442 DestAS: Arg.Flags.getPointerAddrSpace());
3443 }
3444
3445 InVals.push_back(Elt: Ptr);
3446 continue;
3447 }
3448
3449 SDValue NewArg;
3450 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(Val: i)) {
3451 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
3452 // In this case the argument is packed into the previous preload SGPR.
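          // For example, a 2-byte argument at kernarg offset 2 aligns down to
          // 0, giving OffsetDiff = 2: the value is recovered by shifting the
          // 32-bit SGPR copy right by 16 and truncating to the memory type.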
3453 int64_t AlignDownOffset = alignDown(Value: Offset, Align: 4);
3454 int64_t OffsetDiff = Offset - AlignDownOffset;
3455 EVT IntVT = MemVT.changeTypeToInteger();
3456
3457 const SIMachineFunctionInfo *Info =
3458 MF.getInfo<SIMachineFunctionInfo>();
3459 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
3460 Register Reg =
3461 Info->getArgInfo().PreloadKernArgs.find(Val: i)->getSecond().Regs[0];
3462
3463 assert(Reg);
3464 Register VReg = MRI.getLiveInVirtReg(PReg: Reg);
3465 SDValue Copy = DAG.getCopyFromReg(Chain, dl: DL, Reg: VReg, VT: MVT::i32);
3466
3467 SDValue ShiftAmt = DAG.getConstant(Val: OffsetDiff * 8, DL, VT: MVT::i32);
3468 SDValue Extract = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: Copy, N2: ShiftAmt);
3469
3470 SDValue ArgVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: IntVT, Operand: Extract);
3471 ArgVal = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MemVT, Operand: ArgVal);
3472 NewArg = convertArgType(DAG, VT, MemVT, SL: DL, Val: ArgVal,
3473 Signed: Ins[i].Flags.isSExt(), Arg: &Ins[i]);
3474
3475 NewArg = DAG.getMergeValues(Ops: {NewArg, Copy.getValue(R: 1)}, dl: DL);
3476 } else {
3477 const SIMachineFunctionInfo *Info =
3478 MF.getInfo<SIMachineFunctionInfo>();
3479 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
3480 const SmallVectorImpl<MCRegister> &PreloadRegs =
3481 Info->getArgInfo().PreloadKernArgs.find(Val: i)->getSecond().Regs;
3482
3483 SDValue Copy;
3484 if (PreloadRegs.size() == 1) {
3485 Register VReg = MRI.getLiveInVirtReg(PReg: PreloadRegs[0]);
3486 const TargetRegisterClass *RC = MRI.getRegClass(Reg: VReg);
3487 NewArg = DAG.getCopyFromReg(
3488 Chain, dl: DL, Reg: VReg,
3489 VT: EVT::getIntegerVT(Context&: *DAG.getContext(),
3490 BitWidth: TRI->getRegSizeInBits(RC: *RC)));
3491
3492 } else {
3493 // If the kernarg alignment does not match the alignment of the SGPR
3494 // tuple RC that can accommodate this argument, it will be built up
3495               // via copies from the individual SGPRs that the argument was
3496 // preloaded to.
3497 SmallVector<SDValue, 4> Elts;
3498 for (auto Reg : PreloadRegs) {
3499 Register VReg = MRI.getLiveInVirtReg(PReg: Reg);
3500 Copy = DAG.getCopyFromReg(Chain, dl: DL, Reg: VReg, VT: MVT::i32);
3501 Elts.push_back(Elt: Copy);
3502 }
3503 NewArg =
3504 DAG.getBuildVector(VT: EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32,
3505 NumElements: PreloadRegs.size()),
3506 DL, Ops: Elts);
3507 }
3508
3509 // If the argument was preloaded to multiple consecutive 32-bit
3510 // registers because of misalignment between addressable SGPR tuples
3511             // and the argument size, we can still assume, because of kernarg
3512             // segment alignment restrictions, that NewArg's size is the same as
3513             // MemVT and just do a bitcast. If MemVT is less than 32 bits we add a
3514 // truncate since we cannot preload to less than a single SGPR and the
3515 // MemVT may be smaller.
3516 EVT MemVTInt =
3517 EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: MemVT.getSizeInBits());
3518 if (MemVT.bitsLT(VT: NewArg.getSimpleValueType()))
3519 NewArg = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MemVTInt, Operand: NewArg);
3520
3521 NewArg = DAG.getBitcast(VT: MemVT, V: NewArg);
3522 NewArg = convertArgType(DAG, VT, MemVT, SL: DL, Val: NewArg,
3523 Signed: Ins[i].Flags.isSExt(), Arg: &Ins[i]);
3524 NewArg = DAG.getMergeValues(Ops: {NewArg, Chain}, dl: DL);
3525 }
3526 } else {
3527 // Hidden arguments that are in the kernel signature must be preloaded
3528 // to user SGPRs. Print a diagnostic error if a hidden argument is in
3529 // the argument list and is not preloaded.
3530 if (Arg.isOrigArg()) {
3531 Argument *OrigArg = Fn.getArg(i: Arg.getOrigArgIndex());
3532 if (OrigArg->hasAttribute(Kind: "amdgpu-hidden-argument")) {
3533 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
3534 *OrigArg->getParent(),
3535 "hidden argument in kernel signature was not preloaded",
3536 DL.getDebugLoc()));
3537 }
3538 }
3539
3540 NewArg =
3541 lowerKernargMemParameter(DAG, VT, MemVT, SL: DL, Chain, Offset,
3542 Alignment, Signed: Ins[i].Flags.isSExt(), Arg: &Ins[i]);
3543 }
3544 Chains.push_back(Elt: NewArg.getValue(R: 1));
3545
3546 auto *ParamTy =
3547 dyn_cast<PointerType>(Val: FType->getParamType(i: Ins[i].getOrigArgIndex()));
3548 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
3549 ParamTy &&
3550 (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3551 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3552 // On SI local pointers are just offsets into LDS, so they are always
3553         // less than 16 bits. On CI and newer they could potentially be
3554 // real pointers, so we can't guarantee their size.
3555 NewArg = DAG.getNode(Opcode: ISD::AssertZext, DL, VT: NewArg.getValueType(), N1: NewArg,
3556 N2: DAG.getValueType(MVT::i16));
3557 }
3558
3559 InVals.push_back(Elt: NewArg);
3560 continue;
3561 }
3562 if (!IsEntryFunc && VA.isMemLoc()) {
3563 SDValue Val = lowerStackParameter(DAG, VA, SL: DL, Chain, Arg);
3564 InVals.push_back(Elt: Val);
3565 if (!Arg.Flags.isByVal())
3566 Chains.push_back(Elt: Val.getValue(R: 1));
3567 continue;
3568 }
3569
3570 assert(VA.isRegLoc() && "Parameter must be in a register!");
3571
3572 Register Reg = VA.getLocReg();
3573 const TargetRegisterClass *RC = nullptr;
3574 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3575 RC = &AMDGPU::VGPR_32RegClass;
3576 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3577 RC = &AMDGPU::SGPR_32RegClass;
3578 else
3579 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3580
3581 Reg = MF.addLiveIn(PReg: Reg, RC);
3582 SDValue Val = DAG.getCopyFromReg(Chain, dl: DL, Reg, VT);
3583
3584 if (Arg.Flags.isSRet()) {
3585 // The return object should be reasonably addressable.
3586
3587     // FIXME: This helps when the return is a real sret. If it is an
3588 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3589 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3590 unsigned NumBits =
3591 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
3592 Val = DAG.getNode(
3593 Opcode: ISD::AssertZext, DL, VT, N1: Val,
3594 N2: DAG.getValueType(EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: NumBits)));
3595 }
3596
3597 Val = convertABITypeToValueType(DAG, Val, VA, SL: DL);
3598 InVals.push_back(Elt: Val);
3599 }
3600
3601 // Start adding system SGPRs.
3602 if (IsEntryFunc)
3603 allocateSystemSGPRs(CCInfo, MF, Info&: *Info, CallConv, IsShader: IsGraphics);
3604
3605 if (DAG.getPass()) {
3606 auto &ArgUsageInfo =
3607 DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfoWrapperLegacy>();
3608 ArgUsageInfo.getArgUsageInfo().setFuncArgInfo(F: Fn, ArgInfo: Info->getArgInfo());
3609 } else if (auto *MFAM = DAG.getMFAM()) {
3610 Module &M = *MF.getFunction().getParent();
3611 auto *ArgUsageInfo =
3612 MFAM->getResult<ModuleAnalysisManagerMachineFunctionProxy>(IR&: MF)
3613 .getCachedResult<AMDGPUArgumentUsageAnalysis>(IR&: M);
3614 if (ArgUsageInfo)
3615 ArgUsageInfo->setFuncArgInfo(F: Fn, ArgInfo: Info->getArgInfo());
3616 }
3617
3618 unsigned StackArgSize = CCInfo.getStackSize();
3619 Info->setBytesInStackArgArea(StackArgSize);
3620
3621 return Chains.empty() ? Chain
3622 : DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other, Ops: Chains);
3623}
3624
3625// TODO: If return values can't fit in registers, we should return as many as
3626// possible in registers before passing on stack.
3627bool SITargetLowering::CanLowerReturn(
3628 CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3629 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
3630 const Type *RetTy) const {
3631 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3632 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3633 // for shaders. Vector types should be explicitly handled by CC.
3634 if (AMDGPU::isEntryFunctionCC(CC: CallConv))
3635 return true;
3636
3637 SmallVector<CCValAssign, 16> RVLocs;
3638 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3639 if (!CCInfo.CheckReturn(Outs, Fn: CCAssignFnForReturn(CC: CallConv, IsVarArg)))
3640 return false;
3641
3642 // We must use the stack if return would require unavailable registers.
3643 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3644 unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
3645 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3646 if (CCInfo.isAllocated(Reg: AMDGPU::VGPR_32RegClass.getRegister(i)))
3647 return false;
3648
3649 return true;
3650}
3651
3652SDValue
3653SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3654 bool isVarArg,
3655 const SmallVectorImpl<ISD::OutputArg> &Outs,
3656 const SmallVectorImpl<SDValue> &OutVals,
3657 const SDLoc &DL, SelectionDAG &DAG) const {
3658 MachineFunction &MF = DAG.getMachineFunction();
3659 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3660 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
3661
3662 if (AMDGPU::isKernel(CC: CallConv)) {
3663 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3664 OutVals, DL, DAG);
3665 }
3666
3667 bool IsShader = AMDGPU::isShader(CC: CallConv);
3668
3669 Info->setIfReturnsVoid(Outs.empty());
3670 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3671
3672 // CCValAssign - represent the assignment of the return value to a location.
3673 SmallVector<CCValAssign, 48> RVLocs;
3674
3675 // CCState - Info about the registers and stack slots.
3676 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3677 *DAG.getContext());
3678
3679 // Analyze outgoing return values.
3680 CCInfo.AnalyzeReturn(Outs, Fn: CCAssignFnForReturn(CC: CallConv, IsVarArg: isVarArg));
3681
3682 SDValue Glue;
3683 SmallVector<SDValue, 48> RetOps;
3684 RetOps.push_back(Elt: Chain); // Operand #0 = Chain (updated below)
3685
3686 SDValue ReadFirstLane =
3687 DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL, VT: MVT::i32);
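  // Values returned in SGPRs must be wave-uniform, so they are wrapped in
  // llvm.amdgcn.readfirstlane before the CopyToReg below.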
3688 // Copy the result values into the output registers.
3689 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3690 ++I, ++RealRVLocIdx) {
3691 CCValAssign &VA = RVLocs[I];
3692 assert(VA.isRegLoc() && "Can only return in registers!");
3693 // TODO: Partially return in registers if return values don't fit.
3694 SDValue Arg = OutVals[RealRVLocIdx];
3695
3696 // Copied from other backends.
3697 switch (VA.getLocInfo()) {
3698 case CCValAssign::Full:
3699 break;
3700 case CCValAssign::BCvt:
3701 Arg = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getLocVT(), Operand: Arg);
3702 break;
3703 case CCValAssign::SExt:
3704 Arg = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
3705 break;
3706 case CCValAssign::ZExt:
3707 Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
3708 break;
3709 case CCValAssign::AExt:
3710 Arg = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
3711 break;
3712 default:
3713 llvm_unreachable("Unknown loc info!");
3714 }
3715 if (TRI->isSGPRPhysReg(Reg: VA.getLocReg()))
3716 Arg = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: Arg.getValueType(),
3717 N1: ReadFirstLane, N2: Arg);
3718 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: VA.getLocReg(), N: Arg, Glue);
3719 Glue = Chain.getValue(R: 1);
3720 RetOps.push_back(Elt: DAG.getRegister(Reg: VA.getLocReg(), VT: VA.getLocVT()));
3721 }
3722
3723 // FIXME: Does sret work properly?
3724 if (!Info->isEntryFunction()) {
3725 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3726 const MCPhysReg *I =
3727 TRI->getCalleeSavedRegsViaCopy(MF: &DAG.getMachineFunction());
3728 if (I) {
3729 for (; *I; ++I) {
3730 if (AMDGPU::SReg_64RegClass.contains(Reg: *I))
3731 RetOps.push_back(Elt: DAG.getRegister(Reg: *I, VT: MVT::i64));
3732 else if (AMDGPU::SReg_32RegClass.contains(Reg: *I))
3733 RetOps.push_back(Elt: DAG.getRegister(Reg: *I, VT: MVT::i32));
3734 else
3735 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3736 }
3737 }
3738 }
3739
3740 // Update chain and glue.
3741 RetOps[0] = Chain;
3742 if (Glue.getNode())
3743 RetOps.push_back(Elt: Glue);
3744
3745 unsigned Opc = AMDGPUISD::ENDPGM;
3746 if (!IsWaveEnd)
3747 Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
3748 : IsShader ? AMDGPUISD::RETURN_TO_EPILOG
3749 : AMDGPUISD::RET_GLUE;
3750 return DAG.getNode(Opcode: Opc, DL, VT: MVT::Other, Ops: RetOps);
3751}
3752
3753SDValue SITargetLowering::LowerCallResult(
3754 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3755 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3756 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3757 SDValue ThisVal) const {
3758 CCAssignFn *RetCC = CCAssignFnForReturn(CC: CallConv, IsVarArg);
3759
3760 // Assign locations to each value returned by this call.
3761 SmallVector<CCValAssign, 16> RVLocs;
3762 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3763 *DAG.getContext());
3764 CCInfo.AnalyzeCallResult(Ins, Fn: RetCC);
3765
3766 // Copy all of the result registers out of their specified physreg.
3767 for (CCValAssign VA : RVLocs) {
3768 SDValue Val;
3769
3770 if (VA.isRegLoc()) {
3771 Val =
3772 DAG.getCopyFromReg(Chain, dl: DL, Reg: VA.getLocReg(), VT: VA.getLocVT(), Glue: InGlue);
3773 Chain = Val.getValue(R: 1);
3774 InGlue = Val.getValue(R: 2);
3775 } else if (VA.isMemLoc()) {
3776 report_fatal_error(reason: "TODO: return values in memory");
3777 } else
3778 llvm_unreachable("unknown argument location type");
3779
3780 switch (VA.getLocInfo()) {
3781 case CCValAssign::Full:
3782 break;
3783 case CCValAssign::BCvt:
3784 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getValVT(), Operand: Val);
3785 break;
3786 case CCValAssign::ZExt:
3787 Val = DAG.getNode(Opcode: ISD::AssertZext, DL, VT: VA.getLocVT(), N1: Val,
3788 N2: DAG.getValueType(VA.getValVT()));
3789 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VA.getValVT(), Operand: Val);
3790 break;
3791 case CCValAssign::SExt:
3792 Val = DAG.getNode(Opcode: ISD::AssertSext, DL, VT: VA.getLocVT(), N1: Val,
3793 N2: DAG.getValueType(VA.getValVT()));
3794 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VA.getValVT(), Operand: Val);
3795 break;
3796 case CCValAssign::AExt:
3797 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VA.getValVT(), Operand: Val);
3798 break;
3799 default:
3800 llvm_unreachable("Unknown loc info!");
3801 }
3802
3803 InVals.push_back(Elt: Val);
3804 }
3805
3806 return Chain;
3807}
3808
3809// Add code to pass the special inputs required by the features in use,
3810// separate from the explicit user arguments present in the IR.
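//
// For each implicit input the callee may read (dispatch pointer, queue
// pointer, workgroup IDs, ...), look up where the caller received it, copy it
// into the register or stack slot the callee expects, and skip the copy when
// the call site carries the matching amdgpu-no-* attribute.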
3811void SITargetLowering::passSpecialInputs(
3812 CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3813 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3814 SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3815 // If we don't have a call site, this was a call inserted by
3816 // legalization. These can never use special inputs.
3817 if (!CLI.CB)
3818 return;
3819
3820 SelectionDAG &DAG = CLI.DAG;
3821 const SDLoc &DL = CLI.DL;
3822 const Function &F = DAG.getMachineFunction().getFunction();
3823
3824 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3825 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3826
3827 const AMDGPUFunctionArgInfo *CalleeArgInfo =
3828 &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
3829 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3830 if (DAG.getPass()) {
3831 auto &ArgUsageInfo =
3832 DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfoWrapperLegacy>();
3833 CalleeArgInfo =
3834 &ArgUsageInfo.getArgUsageInfo().lookupFuncArgInfo(F: *CalleeFunc);
3835 } else if (auto *MFAM = DAG.getMFAM()) {
3836 Module &M = *DAG.getMachineFunction().getFunction().getParent();
3837 auto *ArgUsageInfo =
3838 MFAM->getResult<ModuleAnalysisManagerMachineFunctionProxy>(
3839 IR&: DAG.getMachineFunction())
3840 .getCachedResult<AMDGPUArgumentUsageAnalysis>(IR&: M);
3841 if (ArgUsageInfo)
3842 CalleeArgInfo = &ArgUsageInfo->lookupFuncArgInfo(F: *CalleeFunc);
3843 }
3844 }
3845
3846 // TODO: Unify with private memory register handling. This is complicated by
3847 // the fact that at least in kernels, the input argument is not necessarily
3848 // in the same location as the input.
3849 // clang-format off
3850 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3851 std::array<StringLiteral, 2>> ImplicitAttrs[] = {
3852 {AMDGPUFunctionArgInfo::DISPATCH_PTR, {"amdgpu-no-dispatch-ptr", ""}},
3853 {AMDGPUFunctionArgInfo::QUEUE_PTR, {"amdgpu-no-queue-ptr", ""}},
3854 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, {"amdgpu-no-implicitarg-ptr", ""}},
3855 {AMDGPUFunctionArgInfo::DISPATCH_ID, {"amdgpu-no-dispatch-id", ""}},
3856 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, {"amdgpu-no-workgroup-id-x", "amdgpu-no-cluster-id-x"}},
3857 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, {"amdgpu-no-workgroup-id-y", "amdgpu-no-cluster-id-y"}},
3858 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, {"amdgpu-no-workgroup-id-z", "amdgpu-no-cluster-id-z"}},
3859 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID, {"amdgpu-no-lds-kernel-id", ""}},
3860 };
3861 // clang-format on
3862
3863 for (auto [InputID, Attrs] : ImplicitAttrs) {
3864 // If the callee does not use the attribute value, skip copying the value.
3865 if (all_of(Range&: Attrs, P: [&](StringRef Attr) {
3866 return Attr.empty() || CLI.CB->hasFnAttr(Kind: Attr);
3867 }))
3868 continue;
3869
3870 const auto [OutgoingArg, ArgRC, ArgTy] =
3871 CalleeArgInfo->getPreloadedValue(Value: InputID);
3872 if (!OutgoingArg)
3873 continue;
3874
3875 const auto [IncomingArg, IncomingArgRC, Ty] =
3876 CallerArgInfo.getPreloadedValue(Value: InputID);
3877 assert(IncomingArgRC == ArgRC);
3878
3879 // All special arguments are ints for now.
3880 EVT ArgVT = TRI->getSpillSize(RC: *ArgRC) == 8 ? MVT::i64 : MVT::i32;
3881 SDValue InputReg;
3882
3883 if (IncomingArg) {
3884 InputReg = loadInputValue(DAG, RC: ArgRC, VT: ArgVT, SL: DL, Arg: *IncomingArg);
3885 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3886 // The implicit arg ptr is special because it doesn't have a corresponding
3887 // input for kernels, and is computed from the kernarg segment pointer.
3888 InputReg = getImplicitArgPtr(DAG, SL: DL);
3889 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3890 std::optional<uint32_t> Id =
3891 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
3892 if (Id.has_value()) {
3893 InputReg = DAG.getConstant(Val: *Id, DL, VT: ArgVT);
3894 } else {
3895 InputReg = DAG.getPOISON(VT: ArgVT);
3896 }
3897 } else {
3898       // We may have proven the input wasn't needed, although the ABI still
3899       // requires it. We just need to allocate the register appropriately.
3900 InputReg = DAG.getPOISON(VT: ArgVT);
3901 }
3902
3903 if (OutgoingArg->isRegister()) {
3904 RegsToPass.emplace_back(Args: OutgoingArg->getRegister(), Args&: InputReg);
3905 if (!CCInfo.AllocateReg(Reg: OutgoingArg->getRegister()))
3906 report_fatal_error(reason: "failed to allocate implicit input argument");
3907 } else {
3908 unsigned SpecialArgOffset =
3909 CCInfo.AllocateStack(Size: ArgVT.getStoreSize(), Alignment: Align(4));
3910 SDValue ArgStore =
3911 storeStackInputValue(DAG, SL: DL, Chain, ArgVal: InputReg, Offset: SpecialArgOffset);
3912 MemOpChains.push_back(Elt: ArgStore);
3913 }
3914 }
3915
3916   // Pack workitem IDs into a single register, or pass them as-is if already
3917 // packed.
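  //
  // The packed form matches the entry-point layout:
  //   packed = X | (Y << 10) | (Z << 20)
  // with each field limited to 10 bits.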
3918
3919 auto [OutgoingArg, ArgRC, Ty] =
3920 CalleeArgInfo->getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_X);
3921 if (!OutgoingArg)
3922 std::tie(args&: OutgoingArg, args&: ArgRC, args&: Ty) =
3923 CalleeArgInfo->getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
3924 if (!OutgoingArg)
3925 std::tie(args&: OutgoingArg, args&: ArgRC, args&: Ty) =
3926 CalleeArgInfo->getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
3927 if (!OutgoingArg)
3928 return;
3929
3930 const ArgDescriptor *IncomingArgX = std::get<0>(
3931 t: CallerArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_X));
3932 const ArgDescriptor *IncomingArgY = std::get<0>(
3933 t: CallerArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Y));
3934 const ArgDescriptor *IncomingArgZ = std::get<0>(
3935 t: CallerArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Z));
3936
3937 SDValue InputReg;
3938 SDLoc SL;
3939
3940 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr(Kind: "amdgpu-no-workitem-id-x");
3941 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr(Kind: "amdgpu-no-workitem-id-y");
3942 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr(Kind: "amdgpu-no-workitem-id-z");
3943
3944 // If incoming ids are not packed we need to pack them.
3945 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3946 NeedWorkItemIDX) {
3947 if (Subtarget->getMaxWorkitemID(Kernel: F, Dimension: 0) != 0) {
3948 InputReg = loadInputValue(DAG, RC: ArgRC, VT: MVT::i32, SL: DL, Arg: *IncomingArgX);
3949 } else {
3950 InputReg = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
3951 }
3952 }
3953
3954 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3955 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(Kernel: F, Dimension: 1) != 0) {
3956 SDValue Y = loadInputValue(DAG, RC: ArgRC, VT: MVT::i32, SL: DL, Arg: *IncomingArgY);
3957 Y = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: Y,
3958 N2: DAG.getShiftAmountConstant(Val: 10, VT: MVT::i32, DL: SL));
3959 InputReg = InputReg.getNode()
3960 ? DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: InputReg, N2: Y)
3961 : Y;
3962 }
3963
3964 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3965 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(Kernel: F, Dimension: 2) != 0) {
3966 SDValue Z = loadInputValue(DAG, RC: ArgRC, VT: MVT::i32, SL: DL, Arg: *IncomingArgZ);
3967 Z = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: Z,
3968 N2: DAG.getShiftAmountConstant(Val: 20, VT: MVT::i32, DL: SL));
3969 InputReg = InputReg.getNode()
3970 ? DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: InputReg, N2: Z)
3971 : Z;
3972 }
3973
3974 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3975 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3976 // We're in a situation where the outgoing function requires the workitem
3977       // ID, but the calling function does not have it (e.g. a graphics function
3978 // calling a C calling convention function). This is illegal, but we need
3979 // to produce something.
3980 InputReg = DAG.getPOISON(VT: MVT::i32);
3981 } else {
3982       // Workitem ids are already packed, so any of the present incoming
3983       // arguments will carry all required fields.
3984 ArgDescriptor IncomingArg =
3985 ArgDescriptor::createArg(Arg: IncomingArgX ? *IncomingArgX
3986 : IncomingArgY ? *IncomingArgY
3987 : *IncomingArgZ,
3988 Mask: ~0u);
3989 InputReg = loadInputValue(DAG, RC: ArgRC, VT: MVT::i32, SL: DL, Arg: IncomingArg);
3990 }
3991 }
3992
3993 if (OutgoingArg->isRegister()) {
3994 if (InputReg)
3995 RegsToPass.emplace_back(Args: OutgoingArg->getRegister(), Args&: InputReg);
3996
3997 CCInfo.AllocateReg(Reg: OutgoingArg->getRegister());
3998 } else {
3999 unsigned SpecialArgOffset = CCInfo.AllocateStack(Size: 4, Alignment: Align(4));
4000 if (InputReg) {
4001 SDValue ArgStore =
4002 storeStackInputValue(DAG, SL: DL, Chain, ArgVal: InputReg, Offset: SpecialArgOffset);
4003 MemOpChains.push_back(Elt: ArgStore);
4004 }
4005 }
4006}
4007
4008bool SITargetLowering::isEligibleForTailCallOptimization(
4009 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
4010 const SmallVectorImpl<ISD::OutputArg> &Outs,
4011 const SmallVectorImpl<SDValue> &OutVals,
4012 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
4013 if (AMDGPU::isChainCC(CC: CalleeCC))
4014 return true;
4015
4016 if (!AMDGPU::mayTailCallThisCC(CC: CalleeCC))
4017 return false;
4018
4019 // For a divergent call target, we need to do a waterfall loop over the
4020 // possible callees which precludes us from using a simple jump.
4021 if (Callee->isDivergent())
4022 return false;
4023
4024 MachineFunction &MF = DAG.getMachineFunction();
4025 const Function &CallerF = MF.getFunction();
4026 CallingConv::ID CallerCC = CallerF.getCallingConv();
4027 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
4028 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4029
4030   // Kernels aren't callable, and don't have a live-in return address, so it
4031 // doesn't make sense to do a tail call with entry functions.
4032 if (!CallerPreserved)
4033 return false;
4034
4035 bool CCMatch = CallerCC == CalleeCC;
4036
4037 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
4038 if (AMDGPU::canGuaranteeTCO(CC: CalleeCC) && CCMatch)
4039 return true;
4040 return false;
4041 }
4042
4043 // TODO: Can we handle var args?
4044 if (IsVarArg)
4045 return false;
4046
4047 for (const Argument &Arg : CallerF.args()) {
4048 if (Arg.hasByValAttr())
4049 return false;
4050 }
4051
4052 LLVMContext &Ctx = *DAG.getContext();
4053
4054 // Check that the call results are passed in the same way.
4055 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C&: Ctx, Ins,
4056 CalleeFn: CCAssignFnForCall(CC: CalleeCC, IsVarArg),
4057 CallerFn: CCAssignFnForCall(CC: CallerCC, IsVarArg)))
4058 return false;
4059
4060 // The callee has to preserve all registers the caller needs to preserve.
4061 if (!CCMatch) {
4062 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4063 if (!TRI->regmaskSubsetEqual(mask0: CallerPreserved, mask1: CalleePreserved))
4064 return false;
4065 }
4066
4067 // Nothing more to check if the callee is taking no arguments.
4068 if (Outs.empty())
4069 return true;
4070
4071 SmallVector<CCValAssign, 16> ArgLocs;
4072 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
4073
4074 // FIXME: We are not allocating special input registers, so we will be
4075 // deciding based on incorrect register assignments.
4076 CCInfo.AnalyzeCallOperands(Outs, Fn: CCAssignFnForCall(CC: CalleeCC, IsVarArg));
4077
4078 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
4079 // If the stack arguments for this call do not fit into our own save area then
4080 // the call cannot be made tail.
4081 // TODO: Is this really necessary?
4082 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
4083 return false;
4084
4085 for (const auto &[CCVA, ArgVal] : zip_equal(t&: ArgLocs, u: OutVals)) {
4086 // FIXME: What about inreg arguments that end up passed in memory?
4087 if (!CCVA.isRegLoc())
4088 continue;
4089
4090 // If we are passing an argument in an SGPR, and the value is divergent,
4091 // this call requires a waterfall loop.
4092 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(Reg: CCVA.getLocReg())) {
4093 LLVM_DEBUG(
4094 dbgs() << "Cannot tail call due to divergent outgoing argument in "
4095 << printReg(CCVA.getLocReg(), TRI) << '\n');
4096 return false;
4097 }
4098 }
4099
4100 const MachineRegisterInfo &MRI = MF.getRegInfo();
4101 return parametersInCSRMatch(MRI, CallerPreservedMask: CallerPreserved, ArgLocs, OutVals);
4102}
4103
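// A call marked 'tail' may only be emitted as a tail call from non-entry
// functions.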
4104bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
4105 if (!CI->isTailCall())
4106 return false;
4107
4108 const Function *ParentFn = CI->getFunction();
4109 if (AMDGPU::isEntryFunctionCC(CC: ParentFn->getCallingConv()))
4110 return false;
4111 return true;
4112}
4113
4114namespace {
4115// Chain calls have special arguments that we need to handle. These tag along
4116// at the end of the arguments list(s), after the SGPR and VGPR arguments
4117// (index 0 and 1 respectively).
4118enum ChainCallArgIdx {
4119 Exec = 2,
4120 Flags,
4121 NumVGPRs,
4122 FallbackExec,
4123 FallbackCallee
4124};
4125} // anonymous namespace
4126
4127// The wave scratch offset register is used as the global base pointer.
4128SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
4129 SmallVectorImpl<SDValue> &InVals) const {
4130 CallingConv::ID CallConv = CLI.CallConv;
4131 bool IsChainCallConv = AMDGPU::isChainCC(CC: CallConv);
4132
4133 SelectionDAG &DAG = CLI.DAG;
4134
4135 const SDLoc &DL = CLI.DL;
4136 SDValue Chain = CLI.Chain;
4137 SDValue Callee = CLI.Callee;
4138
4139 llvm::SmallVector<SDValue, 6> ChainCallSpecialArgs;
4140 bool UsesDynamicVGPRs = false;
4141 if (IsChainCallConv) {
4142 // The last arguments should be the value that we need to put in EXEC,
4143 // followed by the flags and any other arguments with special meanings.
4144 // Pop them out of CLI.Outs and CLI.OutVals before we do any processing so
4145 // we don't treat them like the "real" arguments.
4146 auto RequestedExecIt =
4147 llvm::find_if(Range&: CLI.Outs, P: [](const ISD::OutputArg &Arg) {
4148 return Arg.OrigArgIndex == 2;
4149 });
4150 assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");
4151
4152 size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
4153 CLI.OutVals.erase(CS: CLI.OutVals.begin() + SpecialArgsBeginIdx,
4154 CE: CLI.OutVals.end());
4155 CLI.Outs.erase(CS: RequestedExecIt, CE: CLI.Outs.end());
4156
4157 assert(CLI.Outs.back().OrigArgIndex < 2 &&
4158 "Haven't popped all the special args");
4159
4160 TargetLowering::ArgListEntry RequestedExecArg =
4161 CLI.Args[ChainCallArgIdx::Exec];
4162 if (!RequestedExecArg.Ty->isIntegerTy(Bitwidth: Subtarget->getWavefrontSize()))
4163 return lowerUnhandledCall(CLI, InVals, Reason: "Invalid value for EXEC");
4164
4165 // Convert constants into TargetConstants, so they become immediate operands
4166 // instead of being selected into S_MOV.
4167 auto PushNodeOrTargetConstant = [&](TargetLowering::ArgListEntry Arg) {
4168 if (const auto *ArgNode = dyn_cast<ConstantSDNode>(Val&: Arg.Node)) {
4169 ChainCallSpecialArgs.push_back(Elt: DAG.getTargetConstant(
4170 Val: ArgNode->getAPIntValue(), DL, VT: ArgNode->getValueType(ResNo: 0)));
4171 } else
4172 ChainCallSpecialArgs.push_back(Elt: Arg.Node);
4173 };
4174
4175 PushNodeOrTargetConstant(RequestedExecArg);
4176
4177 // Process any other special arguments depending on the value of the flags.
4178 TargetLowering::ArgListEntry Flags = CLI.Args[ChainCallArgIdx::Flags];
4179
4180 const APInt &FlagsValue = cast<ConstantSDNode>(Val&: Flags.Node)->getAPIntValue();
4181 if (FlagsValue.isZero()) {
4182 if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
4183 return lowerUnhandledCall(CLI, InVals,
4184 Reason: "no additional args allowed if flags == 0");
4185 } else if (FlagsValue.isOneBitSet(BitNo: 0)) {
4186 if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
4187 return lowerUnhandledCall(CLI, InVals, Reason: "expected 3 additional args");
4188 }
4189
4190 if (!Subtarget->isWave32()) {
4191 return lowerUnhandledCall(
4192 CLI, InVals, Reason: "dynamic VGPR mode is only supported for wave32");
4193 }
4194
4195 UsesDynamicVGPRs = true;
4196 std::for_each(first: CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
4197 last: CLI.Args.end(), f: PushNodeOrTargetConstant);
4198 }
4199 }
4200
4201 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
4202 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
4203 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
4204 bool &IsTailCall = CLI.IsTailCall;
4205 bool IsVarArg = CLI.IsVarArg;
4206 bool IsSibCall = false;
4207 MachineFunction &MF = DAG.getMachineFunction();
4208
4209 if (Callee.isUndef() || isNullConstant(V: Callee)) {
4210 if (!CLI.IsTailCall) {
4211 for (ISD::InputArg &Arg : CLI.Ins)
4212 InVals.push_back(Elt: DAG.getPOISON(VT: Arg.VT));
4213 }
4214
4215 return Chain;
4216 }
4217
4218 if (IsVarArg) {
4219 return lowerUnhandledCall(CLI, InVals,
4220 Reason: "unsupported call to variadic function ");
4221 }
4222
4223 if (!CLI.CB)
4224 return lowerUnhandledCall(CLI, InVals, Reason: "unsupported libcall legalization");
4225
4226 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
4227 return lowerUnhandledCall(CLI, InVals,
4228 Reason: "unsupported required tail call to function ");
4229 }
4230
4231 if (IsTailCall) {
4232 IsTailCall = isEligibleForTailCallOptimization(Callee, CalleeCC: CallConv, IsVarArg,
4233 Outs, OutVals, Ins, DAG);
4234 if (!IsTailCall &&
4235 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
4236 report_fatal_error(reason: "failed to perform tail call elimination on a call "
4237 "site marked musttail or on llvm.amdgcn.cs.chain");
4238 }
4239
4240 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4241
4242 // A sibling call is one where we're under the usual C ABI and not planning
4243 // to change that but can still do a tail call:
4244 if (!TailCallOpt && IsTailCall)
4245 IsSibCall = true;
4246
4247 if (IsTailCall)
4248 ++NumTailCalls;
4249 }
4250
4251 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4252 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
4253 SmallVector<SDValue, 8> MemOpChains;
4254
4255 // Analyze operands of the call, assigning locations to each operand.
4256 SmallVector<CCValAssign, 16> ArgLocs;
4257 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
4258 CCAssignFn *AssignFn = CCAssignFnForCall(CC: CallConv, IsVarArg);
4259
4260 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CC: CallConv) &&
4261 CallConv != CallingConv::AMDGPU_Gfx_WholeWave) {
4262 // With a fixed ABI, allocate fixed registers before user arguments.
4263 passSpecialInputs(CLI, CCInfo, Info: *Info, RegsToPass, MemOpChains, Chain);
4264 }
4265
4266 CCInfo.AnalyzeCallOperands(Outs, Fn: AssignFn);
4267
4268 // Get a count of how many bytes are to be pushed on the stack.
4269 unsigned NumBytes = CCInfo.getStackSize();
4270
4271 if (IsSibCall) {
4272 // Since we're not changing the ABI to make this a tail call, the memory
4273 // operands are already available in the caller's incoming argument space.
4274 NumBytes = 0;
4275 }
4276
4277 // FPDiff is the byte offset of the call's argument area from the callee's.
4278 // Stores to callee stack arguments will be placed in FixedStackSlots offset
4279 // by this amount for a tail call. In a sibling call it must be 0 because the
4280 // caller will deallocate the entire stack and the callee still expects its
4281 // arguments to begin at SP+0. Completely unused for non-tail calls.
4282 int32_t FPDiff = 0;
4283 MachineFrameInfo &MFI = MF.getFrameInfo();
4284 auto *TRI = Subtarget->getRegisterInfo();
4285
4286 // Adjust the stack pointer for the new arguments...
4287 // These operations are automatically eliminated by the prolog/epilog pass
4288 if (!IsSibCall)
4289 Chain = DAG.getCALLSEQ_START(Chain, InSize: 0, OutSize: 0, DL);
4290
4291 if (!IsSibCall || IsChainCallConv) {
4292 if (!Subtarget->hasFlatScratchEnabled()) {
4293 SmallVector<SDValue, 4> CopyFromChains;
4294
4295 // In the HSA case, this should be an identity copy.
4296 SDValue ScratchRSrcReg =
4297 DAG.getCopyFromReg(Chain, dl: DL, Reg: Info->getScratchRSrcReg(), VT: MVT::v4i32);
4298 RegsToPass.emplace_back(Args: IsChainCallConv
4299 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
4300 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
4301 Args&: ScratchRSrcReg);
4302 CopyFromChains.push_back(Elt: ScratchRSrcReg.getValue(R: 1));
4303 Chain = DAG.getTokenFactor(DL, Vals&: CopyFromChains);
4304 }
4305 }
4306
4307 const unsigned NumSpecialInputs = RegsToPass.size();
4308
4309 MVT PtrVT = MVT::i32;
4310
4311 // Walk the register/memloc assignments, inserting copies/loads.
4312 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4313 CCValAssign &VA = ArgLocs[i];
4314 SDValue Arg = OutVals[i];
4315
4316 // Promote the value if needed.
4317 switch (VA.getLocInfo()) {
4318 case CCValAssign::Full:
4319 break;
4320 case CCValAssign::BCvt:
4321 Arg = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getLocVT(), Operand: Arg);
4322 break;
4323 case CCValAssign::ZExt:
4324 Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
4325 break;
4326 case CCValAssign::SExt:
4327 Arg = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
4328 break;
4329 case CCValAssign::AExt:
4330 Arg = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
4331 break;
4332 case CCValAssign::FPExt:
4333 Arg = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
4334 break;
4335 default:
4336 llvm_unreachable("Unknown loc info!");
4337 }
4338
4339 if (VA.isRegLoc()) {
4340 RegsToPass.push_back(Elt: std::pair(VA.getLocReg(), Arg));
4341 } else {
4342 assert(VA.isMemLoc());
4343
4344 SDValue DstAddr;
4345 MachinePointerInfo DstInfo;
4346
4347 unsigned LocMemOffset = VA.getLocMemOffset();
4348 int32_t Offset = LocMemOffset;
4349
4350 SDValue PtrOff = DAG.getConstant(Val: Offset, DL, VT: PtrVT);
4351 MaybeAlign Alignment;
4352
4353 if (IsTailCall) {
4354 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4355 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
4356 : VA.getValVT().getStoreSize();
4357
4358 // FIXME: We can have better than the minimum byval required alignment.
4359 Alignment =
4360 Flags.isByVal()
4361 ? Flags.getNonZeroByValAlign()
4362 : commonAlignment(A: Subtarget->getStackAlignment(), Offset);
4363
4364 Offset = Offset + FPDiff;
4365 int FI = MFI.CreateFixedObject(Size: OpSize, SPOffset: Offset, IsImmutable: true);
4366
4367 DstAddr = DAG.getFrameIndex(FI, VT: PtrVT);
4368 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
4369
4370 // Make sure any stack arguments overlapping with where we're storing
4371 // are loaded before this eventual operation. Otherwise they'll be
4372 // clobbered.
4373
4374 // FIXME: Why is this really necessary? This seems to just result in a
4375 // lot of code to copy the stack and write them back to the same
4376 // locations, which are supposed to be immutable?
4377 Chain = addTokenForArgument(Chain, DAG, MFI, ClobberedFI: FI);
4378 } else {
4379 // Stores to the argument stack area are relative to the stack pointer.
4380 SDValue SP = DAG.getCopyFromReg(Chain, dl: DL, Reg: Info->getStackPtrOffsetReg(),
4381 VT: MVT::i32);
4382 DstAddr = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, N1: SP, N2: PtrOff);
4383 DstInfo = MachinePointerInfo::getStack(MF, Offset: LocMemOffset);
4384 Alignment =
4385 commonAlignment(A: Subtarget->getStackAlignment(), Offset: LocMemOffset);
4386 }
4387
4388 if (Outs[i].Flags.isByVal()) {
4389 SDValue SizeNode =
4390 DAG.getConstant(Val: Outs[i].Flags.getByValSize(), DL, VT: MVT::i32);
4391 SDValue Cpy =
4392 DAG.getMemcpy(Chain, dl: DL, Dst: DstAddr, Src: Arg, Size: SizeNode,
4393 Alignment: Outs[i].Flags.getNonZeroByValAlign(),
4394 /*isVol = */ false, /*AlwaysInline = */ true,
4395 /*CI=*/nullptr, OverrideTailCall: std::nullopt, DstPtrInfo: DstInfo,
4396 SrcPtrInfo: MachinePointerInfo(AMDGPUAS::PRIVATE_ADDRESS));
4397
4398 MemOpChains.push_back(Elt: Cpy);
4399 } else {
4400 SDValue Store =
4401 DAG.getStore(Chain, dl: DL, Val: Arg, Ptr: DstAddr, PtrInfo: DstInfo, Alignment);
4402 MemOpChains.push_back(Elt: Store);
4403 }
4404 }
4405 }
4406
4407 if (!MemOpChains.empty())
4408 Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other, Ops: MemOpChains);
4409
4410 SDValue ReadFirstLaneID =
4411 DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL, VT: MVT::i32);
4412
4413 SDValue TokenGlue;
4414 if (CLI.ConvergenceControlToken) {
4415 TokenGlue = DAG.getNode(Opcode: ISD::CONVERGENCECTRL_GLUE, DL, VT: MVT::Glue,
4416 Operand: CLI.ConvergenceControlToken);
4417 }
4418
4419 // Build a sequence of copy-to-reg nodes chained together with token chain
4420 // and flag operands which copy the outgoing args into the appropriate regs.
4421 SDValue InGlue;
4422
4423 unsigned ArgIdx = 0;
4424 for (auto [Reg, Val] : RegsToPass) {
4425 if (ArgIdx++ >= NumSpecialInputs &&
4426 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
4427      // For chain calls, the inreg arguments are required to be
4428      // uniform. Speculatively insert a readfirstlane in case we cannot prove
4429      // they are uniform.
4430      //
4431      // For other calls, if an inreg argument is known to be uniform,
4432      // speculatively insert a readfirstlane in case it is in a VGPR.
4433      //
4434      // FIXME: We need to execute this in a waterfall loop if it is a divergent
4435      // value; for now this continues to produce invalid code.
4436
4437 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
4438 if (TokenGlue)
4439 ReadfirstlaneArgs.push_back(Elt: TokenGlue);
4440 Val = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: Val.getValueType(),
4441 Ops: ReadfirstlaneArgs);
4442 }
4443
4444 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg, N: Val, Glue: InGlue);
4445 InGlue = Chain.getValue(R: 1);
4446 }
4447
4448  // We don't usually want to end the call-sequence here because we would tidy
4449  // the frame up *after* the call. However, in the ABI-changing tail-call case
4450  // we've carefully laid out the parameters so that when sp is reset they'll be
4451  // in the correct location.
4452 if (IsTailCall && !IsSibCall) {
4453 Chain = DAG.getCALLSEQ_END(Chain, Size1: NumBytes, Size2: 0, Glue: InGlue, DL);
4454 InGlue = Chain.getValue(R: 1);
4455 }
4456
4457 std::vector<SDValue> Ops({Chain});
4458
4459 // Add a redundant copy of the callee global which will not be legalized, as
4460 // we need direct access to the callee later.
4461 if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Val&: Callee)) {
4462 const GlobalValue *GV = GSD->getGlobal();
4463 Ops.push_back(x: Callee);
4464 Ops.push_back(x: DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i64));
4465 } else {
4466 if (IsTailCall) {
4467 // isEligibleForTailCallOptimization considered whether the call target is
4468 // divergent, but we may still end up with a uniform value in a VGPR.
4469 // Insert a readfirstlane just in case.
4470 SDValue ReadFirstLaneID =
4471 DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL, VT: MVT::i32);
4472
4473 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
4474 if (TokenGlue)
4475 ReadfirstlaneArgs.push_back(Elt: TokenGlue); // Wire up convergence token.
4476 Callee = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: Callee.getValueType(),
4477 Ops: ReadfirstlaneArgs);
4478 }
4479
4480 Ops.push_back(x: Callee);
4481 Ops.push_back(x: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i64));
4482 }
4483
4484 if (IsTailCall) {
4485 // Each tail call may have to adjust the stack by a different amount, so
4486 // this information must travel along with the operation for eventual
4487 // consumption by emitEpilogue.
4488 Ops.push_back(x: DAG.getTargetConstant(Val: FPDiff, DL, VT: MVT::i32));
4489 }
4490
4491 if (IsChainCallConv)
4492 llvm::append_range(C&: Ops, R&: ChainCallSpecialArgs);
4493
4494 // Add argument registers to the end of the list so that they are known live
4495 // into the call.
4496 for (auto &[Reg, Val] : RegsToPass)
4497 Ops.push_back(x: DAG.getRegister(Reg, VT: Val.getValueType()));
4498
4499 // Add a register mask operand representing the call-preserved registers.
4500 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
4501 assert(Mask && "Missing call preserved mask for calling convention");
4502 Ops.push_back(x: DAG.getRegisterMask(RegMask: Mask));
4503
4504 if (SDValue Token = CLI.ConvergenceControlToken) {
4505 SmallVector<SDValue, 2> GlueOps;
4506 GlueOps.push_back(Elt: Token);
4507 if (InGlue)
4508 GlueOps.push_back(Elt: InGlue);
4509
4510 InGlue = SDValue(DAG.getMachineNode(Opcode: TargetOpcode::CONVERGENCECTRL_GLUE, dl: DL,
4511 VT: MVT::Glue, Ops: GlueOps),
4512 0);
4513 }
4514
4515 if (InGlue)
4516 Ops.push_back(x: InGlue);
4517
4518  // If we're doing a tail call, use a TC_RETURN here rather than an
4519  // actual call instruction.
4520 if (IsTailCall) {
4521 MFI.setHasTailCall();
4522 unsigned OPC = AMDGPUISD::TC_RETURN;
4523 switch (CallConv) {
4524 case CallingConv::AMDGPU_Gfx:
4525 OPC = AMDGPUISD::TC_RETURN_GFX;
4526 break;
4527 case CallingConv::AMDGPU_CS_Chain:
4528 case CallingConv::AMDGPU_CS_ChainPreserve:
4529 OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
4530 : AMDGPUISD::TC_RETURN_CHAIN;
4531 break;
4532 }
4533
4534 // If the caller is a whole wave function, we need to use a special opcode
4535 // so we can patch up EXEC.
4536 if (Info->isWholeWaveFunction())
4537 OPC = AMDGPUISD::TC_RETURN_GFX_WholeWave;
4538
4539 return DAG.getNode(Opcode: OPC, DL, VT: MVT::Other, Ops);
4540 }
4541
4542 // Returns a chain and a flag for retval copy to use.
4543 SDValue Call = DAG.getNode(Opcode: AMDGPUISD::CALL, DL, ResultTys: {MVT::Other, MVT::Glue}, Ops);
4544 Chain = Call.getValue(R: 0);
4545 InGlue = Call.getValue(R: 1);
4546
4547 uint64_t CalleePopBytes = NumBytes;
4548 Chain = DAG.getCALLSEQ_END(Chain, Size1: 0, Size2: CalleePopBytes, Glue: InGlue, DL);
4549 if (!Ins.empty())
4550 InGlue = Chain.getValue(R: 1);
4551
4552 // Handle result values, copying them out of physregs into vregs that we
4553 // return.
4554 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4555 InVals, /*IsThisReturn=*/false, ThisVal: SDValue());
4556}
4557
4558// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4559// except for:
4560// 1. Stack growth direction (default: downwards, AMDGPU: upwards), and
4561// 2. Size scaling, where scaled size = wave-reduction(alloca-size) * wave-size.
4562SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
4563 SelectionDAG &DAG) const {
4564 const MachineFunction &MF = DAG.getMachineFunction();
4565 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4566
4567 SDLoc dl(Op);
4568 EVT VT = Op.getValueType();
4569 SDValue Chain = Op.getOperand(i: 0);
4570 Register SPReg = Info->getStackPtrOffsetReg();
4571
4572 // Chain the dynamic stack allocation so that it doesn't modify the stack
4573 // pointer when other instructions are using the stack.
4574 Chain = DAG.getCALLSEQ_START(Chain, InSize: 0, OutSize: 0, DL: dl);
4575
4576 SDValue Size = Op.getOperand(i: 1);
4577 SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, Reg: SPReg, VT);
4578 Align Alignment = cast<ConstantSDNode>(Val: Op.getOperand(i: 2))->getAlignValue();
4579
4580 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
4581 assert(TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp &&
4582 "Stack grows upwards for AMDGPU");
4583
4584 Chain = BaseAddr.getValue(R: 1);
4585 Align StackAlign = TFL->getStackAlign();
4586 if (Alignment > StackAlign) {
4587 uint64_t ScaledAlignment = Alignment.value()
4588 << Subtarget->getWavefrontSizeLog2();
4589 uint64_t StackAlignMask = ScaledAlignment - 1;
4590 SDValue TmpAddr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT, N1: BaseAddr,
4591 N2: DAG.getConstant(Val: StackAlignMask, DL: dl, VT));
4592 BaseAddr = DAG.getNode(Opcode: ISD::AND, DL: dl, VT, N1: TmpAddr,
4593 N2: DAG.getSignedConstant(Val: -ScaledAlignment, DL: dl, VT));
4594 }
4595
4596 assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4597 SDValue NewSP;
4598 if (isa<ConstantSDNode>(Val: Size)) {
4599    // For a constant-sized alloca, scale the alloca size by the wave size.
4600 SDValue ScaledSize = DAG.getNode(
4601 Opcode: ISD::SHL, DL: dl, VT, N1: Size,
4602 N2: DAG.getConstant(Val: Subtarget->getWavefrontSizeLog2(), DL: dl, VT: MVT::i32));
4603 NewSP = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT, N1: BaseAddr, N2: ScaledSize); // Value
4604 } else {
4605    // For a dynamically sized alloca, perform a wave-wide reduction to get the
4606    // max of the (divergent) alloca size, then scale it by the wave size.
4607 SDValue WaveReduction =
4608 DAG.getTargetConstant(Val: Intrinsic::amdgcn_wave_reduce_umax, DL: dl, VT: MVT::i32);
4609 Size = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT: MVT::i32, N1: WaveReduction,
4610 N2: Size, N3: DAG.getConstant(Val: 0, DL: dl, VT: MVT::i32));
4611 SDValue ScaledSize = DAG.getNode(
4612 Opcode: ISD::SHL, DL: dl, VT, N1: Size,
4613 N2: DAG.getConstant(Val: Subtarget->getWavefrontSizeLog2(), DL: dl, VT: MVT::i32));
4614 NewSP =
4615 DAG.getNode(Opcode: ISD::ADD, DL: dl, VT, N1: BaseAddr, N2: ScaledSize); // Value in vgpr.
4616 SDValue ReadFirstLaneID =
4617 DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL: dl, VT: MVT::i32);
4618 NewSP = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT: MVT::i32, N1: ReadFirstLaneID,
4619 N2: NewSP);
4620 }
4621
4622 Chain = DAG.getCopyToReg(Chain, dl, Reg: SPReg, N: NewSP); // Output chain
4623 SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, Size1: 0, Size2: 0, Glue: SDValue(), DL: dl);
4624
4625 return DAG.getMergeValues(Ops: {BaseAddr, CallSeqEnd}, dl);
4626}
4627
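// Lower llvm.stacksave: copy the wave-uniform stack pointer and convert it to
// a swizzled per-lane vector address.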
4628SDValue SITargetLowering::LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const {
4629 if (Op.getValueType() != MVT::i32)
4630 return Op; // Defer to cannot select error.
4631
4632 Register SP = getStackPointerRegisterToSaveRestore();
4633 SDLoc SL(Op);
4634
4635 SDValue CopyFromSP = DAG.getCopyFromReg(Chain: Op->getOperand(Num: 0), dl: SL, Reg: SP, VT: MVT::i32);
4636
4637 // Convert from wave uniform to swizzled vector address. This should protect
4638 // from any edge cases where the stacksave result isn't directly used with
4639 // stackrestore.
4640 SDValue VectorAddress =
4641 DAG.getNode(Opcode: AMDGPUISD::WAVE_ADDRESS, DL: SL, VT: MVT::i32, Operand: CopyFromSP);
4642 return DAG.getMergeValues(Ops: {VectorAddress, CopyFromSP.getValue(R: 1)}, dl: SL);
4643}
4644
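// Lower GET_ROUNDING: read the 4-bit MODE.fp_round field with s_getreg and map
// the raw hardware value to the FLT_ROUNDS encoding via a constant bit table.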
4645SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
4646 SelectionDAG &DAG) const {
4647 SDLoc SL(Op);
4648 assert(Op.getValueType() == MVT::i32);
4649
4650 uint32_t BothRoundHwReg =
4651 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 0, Values: 4);
4652 SDValue GetRoundBothImm = DAG.getTargetConstant(Val: BothRoundHwReg, DL: SL, VT: MVT::i32);
4653
4654 SDValue IntrinID =
4655 DAG.getTargetConstant(Val: Intrinsic::amdgcn_s_getreg, DL: SL, VT: MVT::i32);
4656 SDValue GetReg = DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL: SL, VTList: Op->getVTList(),
4657 N1: Op.getOperand(i: 0), N2: IntrinID, N3: GetRoundBothImm);
4658
4659 // There are two rounding modes, one for f32 and one for f64/f16. We only
4660 // report in the standard value range if both are the same.
4661 //
4662 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4663 // ties away from zero is not supported, and the other values are rotated by
4664 // 1.
4665 //
4666 // If the two rounding modes are not the same, report a target defined value.
4667
4668 // Mode register rounding mode fields:
4669 //
4670 // [1:0] Single-precision round mode.
4671 // [3:2] Double/Half-precision round mode.
4672 //
4673  // 0 = nearest even, 1 = +infinity, 2 = -infinity, 3 = toward zero.
4674 //
4675 // Hardware Spec
4676 // Toward-0 3 0
4677 // Nearest Even 0 1
4678 // +Inf 1 2
4679 // -Inf 2 3
4680 // NearestAway0 N/A 4
4681 //
4682 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4683 // table we can index by the raw hardware mode.
4684 //
4685 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
4686
4687 SDValue BitTable =
4688 DAG.getConstant(Val: AMDGPU::FltRoundConversionTable, DL: SL, VT: MVT::i64);
4689
4690 SDValue Two = DAG.getConstant(Val: 2, DL: SL, VT: MVT::i32);
4691 SDValue RoundModeTimesNumBits =
4692 DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: GetReg, N2: Two);
4693
4694 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4695 // knew only one mode was demanded.
4696 SDValue TableValue =
4697 DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: MVT::i64, N1: BitTable, N2: RoundModeTimesNumBits);
4698 SDValue TruncTable = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: TableValue);
4699
4700 SDValue EntryMask = DAG.getConstant(Val: 0xf, DL: SL, VT: MVT::i32);
4701 SDValue TableEntry =
4702 DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32, N1: TruncTable, N2: EntryMask);
4703
4704 // There's a gap in the 4-bit encoded table and actual enum values, so offset
4705 // if it's an extended value.
4706 SDValue Four = DAG.getConstant(Val: 4, DL: SL, VT: MVT::i32);
4707 SDValue IsStandardValue =
4708 DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: TableEntry, RHS: Four, Cond: ISD::SETULT);
4709 SDValue EnumOffset = DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32, N1: TableEntry, N2: Four);
4710 SDValue Result = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::i32, N1: IsStandardValue,
4711 N2: TableEntry, N3: EnumOffset);
4712
4713 return DAG.getMergeValues(Ops: {Result, GetReg.getValue(R: 1)}, dl: SL);
4714}
4715
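// Lower SET_ROUNDING: map the FLT_ROUNDS value to the hardware MODE.fp_round
// encoding and write it back with s_setreg.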
4716SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op,
4717 SelectionDAG &DAG) const {
4718 SDLoc SL(Op);
4719
4720 SDValue NewMode = Op.getOperand(i: 1);
4721 assert(NewMode.getValueType() == MVT::i32);
4722
4723 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4724 // hardware MODE.fp_round values.
4725 if (auto *ConstMode = dyn_cast<ConstantSDNode>(Val&: NewMode)) {
4726 uint32_t ClampedVal = std::min(
4727 a: static_cast<uint32_t>(ConstMode->getZExtValue()),
4728 b: static_cast<uint32_t>(AMDGPU::TowardZeroF32_TowardNegativeF64));
4729 NewMode = DAG.getConstant(
4730 Val: AMDGPU::decodeFltRoundToHWConversionTable(FltRounds: ClampedVal), DL: SL, VT: MVT::i32);
4731 } else {
4732 // If we know the input can only be one of the supported standard modes in
4733 // the range 0-3, we can use a simplified mapping to hardware values.
4734 KnownBits KB = DAG.computeKnownBits(Op: NewMode);
4735 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4736 // The supported standard values are 0-3. The extended values start at 8. We
4737 // need to offset by 4 if the value is in the extended range.
4738
4739 if (UseReducedTable) {
4740      // Only the low 16 bits (the standard-mode entries) of the table are needed.
4741 SDValue BitTable = DAG.getConstant(
4742 Val: AMDGPU::FltRoundToHWConversionTable & 0xffff, DL: SL, VT: MVT::i32);
4743
4744 SDValue Two = DAG.getConstant(Val: 2, DL: SL, VT: MVT::i32);
4745 SDValue RoundModeTimesNumBits =
4746 DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: NewMode, N2: Two);
4747
4748 NewMode =
4749 DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: MVT::i32, N1: BitTable, N2: RoundModeTimesNumBits);
4750
4751 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4752 // the table extracted bits into inline immediates.
4753 } else {
4754 // table_index = umin(value, value - 4)
4755 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
4756 SDValue BitTable =
4757 DAG.getConstant(Val: AMDGPU::FltRoundToHWConversionTable, DL: SL, VT: MVT::i64);
4758
4759 SDValue Four = DAG.getConstant(Val: 4, DL: SL, VT: MVT::i32);
4760 SDValue OffsetEnum = DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i32, N1: NewMode, N2: Four);
4761 SDValue IndexVal =
4762 DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32, N1: NewMode, N2: OffsetEnum);
4763
4764 SDValue Two = DAG.getConstant(Val: 2, DL: SL, VT: MVT::i32);
4765 SDValue RoundModeTimesNumBits =
4766 DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: IndexVal, N2: Two);
4767
4768 SDValue TableValue =
4769 DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: MVT::i64, N1: BitTable, N2: RoundModeTimesNumBits);
4770 SDValue TruncTable = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: TableValue);
4771
4772 // No need to mask out the high bits since the setreg will ignore them
4773 // anyway.
4774 NewMode = TruncTable;
4775 }
4776
4777 // Insert a readfirstlane in case the value is a VGPR. We could do this
4778 // earlier and keep more operations scalar, but that interferes with
4779 // combining the source.
4780 SDValue ReadFirstLaneID =
4781 DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL: SL, VT: MVT::i32);
4782 NewMode = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32,
4783 N1: ReadFirstLaneID, N2: NewMode);
4784 }
4785
4786 // N.B. The setreg will be later folded into s_round_mode on supported
4787 // targets.
4788 SDValue IntrinID =
4789 DAG.getTargetConstant(Val: Intrinsic::amdgcn_s_setreg, DL: SL, VT: MVT::i32);
4790 uint32_t BothRoundHwReg =
4791 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 0, Values: 4);
4792 SDValue RoundBothImm = DAG.getTargetConstant(Val: BothRoundHwReg, DL: SL, VT: MVT::i32);
4793
4794 SDValue SetReg =
4795 DAG.getNode(Opcode: ISD::INTRINSIC_VOID, DL: SL, VTList: Op->getVTList(), N1: Op.getOperand(i: 0),
4796 N2: IntrinID, N3: RoundBothImm, N4: NewMode);
4797
4798 return SetReg;
4799}
4800
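// Only keep prefetches the target can perform; bail out otherwise, e.g. for an
// instruction prefetch of a divergent address or an unsupported address space.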
4801SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
4802 if (Op->isDivergent() &&
4803 (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(i: 4)))
4804 // Cannot do I$ prefetch with divergent pointer.
4805 return SDValue();
4806
4807 switch (cast<MemSDNode>(Val&: Op)->getAddressSpace()) {
4808 case AMDGPUAS::FLAT_ADDRESS:
4809 case AMDGPUAS::GLOBAL_ADDRESS:
4810 case AMDGPUAS::CONSTANT_ADDRESS:
4811 break;
4812 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
4813 if (Subtarget->hasSafeSmemPrefetch())
4814 break;
4815 [[fallthrough]];
4816 default:
4817 return SDValue();
4818 }
4819
4820 // I$ prefetch
4821 if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(i: 4))
4822 return SDValue();
4823
4824 return Op;
4825}
4826
4827// Work around DAG legality rules only based on the result type.
4828SDValue SITargetLowering::lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
4829 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4830 SDValue Src = Op.getOperand(i: IsStrict ? 1 : 0);
4831 EVT SrcVT = Src.getValueType();
4832
4833 if (SrcVT.getScalarType() != MVT::bf16)
4834 return Op;
4835
4836 SDLoc SL(Op);
4837 SDValue BitCast =
4838 DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: SrcVT.changeTypeToInteger(), Operand: Src);
4839
4840 EVT DstVT = Op.getValueType();
4841 if (IsStrict)
4842 llvm_unreachable("Need STRICT_BF16_TO_FP");
4843
4844 return DAG.getNode(Opcode: ISD::BF16_TO_FP, DL: SL, VT: DstVT, Operand: BitCast);
4845}
4846
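// Lower llvm.get.fpenv: read the MODE and TRAPSTS hardware registers and pack
// them into a single i64 result.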
4847SDValue SITargetLowering::lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const {
4848 SDLoc SL(Op);
4849 if (Op.getValueType() != MVT::i64)
4850 return Op;
4851
4852 uint32_t ModeHwReg =
4853 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 0, Values: 23);
4854 SDValue ModeHwRegImm = DAG.getTargetConstant(Val: ModeHwReg, DL: SL, VT: MVT::i32);
4855 uint32_t TrapHwReg =
4856 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_TRAPSTS, Values: 0, Values: 5);
4857 SDValue TrapHwRegImm = DAG.getTargetConstant(Val: TrapHwReg, DL: SL, VT: MVT::i32);
4858
4859 SDVTList VTList = DAG.getVTList(VT1: MVT::i32, VT2: MVT::Other);
4860 SDValue IntrinID =
4861 DAG.getTargetConstant(Val: Intrinsic::amdgcn_s_getreg, DL: SL, VT: MVT::i32);
4862 SDValue GetModeReg = DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL: SL, VTList,
4863 N1: Op.getOperand(i: 0), N2: IntrinID, N3: ModeHwRegImm);
4864 SDValue GetTrapReg = DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL: SL, VTList,
4865 N1: Op.getOperand(i: 0), N2: IntrinID, N3: TrapHwRegImm);
4866 SDValue TokenReg =
4867 DAG.getNode(Opcode: ISD::TokenFactor, DL: SL, VT: MVT::Other, N1: GetModeReg.getValue(R: 1),
4868 N2: GetTrapReg.getValue(R: 1));
4869
4870 SDValue CvtPtr =
4871 DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32, N1: GetModeReg, N2: GetTrapReg);
4872 SDValue Result = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: CvtPtr);
4873
4874 return DAG.getMergeValues(Ops: {Result, TokenReg}, dl: SL);
4875}
4876
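// Lower llvm.set.fpenv: split the i64 value into MODE and TRAPSTS halves and
// write each back with s_setreg.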
4877SDValue SITargetLowering::lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const {
4878 SDLoc SL(Op);
4879 if (Op.getOperand(i: 1).getValueType() != MVT::i64)
4880 return Op;
4881
4882 SDValue Input = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Op.getOperand(i: 1));
4883 SDValue NewModeReg = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Input,
4884 N2: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32));
4885 SDValue NewTrapReg = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Input,
4886 N2: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32));
4887
4888 SDValue ReadFirstLaneID =
4889 DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL: SL, VT: MVT::i32);
4890 NewModeReg = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32,
4891 N1: ReadFirstLaneID, N2: NewModeReg);
4892 NewTrapReg = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32,
4893 N1: ReadFirstLaneID, N2: NewTrapReg);
4894
4895 unsigned ModeHwReg =
4896 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 0, Values: 23);
4897 SDValue ModeHwRegImm = DAG.getTargetConstant(Val: ModeHwReg, DL: SL, VT: MVT::i32);
4898 unsigned TrapHwReg =
4899 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_TRAPSTS, Values: 0, Values: 5);
4900 SDValue TrapHwRegImm = DAG.getTargetConstant(Val: TrapHwReg, DL: SL, VT: MVT::i32);
4901
4902 SDValue IntrinID =
4903 DAG.getTargetConstant(Val: Intrinsic::amdgcn_s_setreg, DL: SL, VT: MVT::i32);
4904 SDValue SetModeReg =
4905 DAG.getNode(Opcode: ISD::INTRINSIC_VOID, DL: SL, VT: MVT::Other, N1: Op.getOperand(i: 0),
4906 N2: IntrinID, N3: ModeHwRegImm, N4: NewModeReg);
4907 SDValue SetTrapReg =
4908 DAG.getNode(Opcode: ISD::INTRINSIC_VOID, DL: SL, VT: MVT::Other, N1: Op.getOperand(i: 0),
4909 N2: IntrinID, N3: TrapHwRegImm, N4: NewTrapReg);
4910 return DAG.getNode(Opcode: ISD::TokenFactor, DL: SL, VT: MVT::Other, N1: SetTrapReg, N2: SetModeReg);
4911}
4912
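// Resolve register names used by llvm.read_register / llvm.write_register,
// diagnosing names or types that are invalid for this subtarget.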
4913Register SITargetLowering::getRegisterByName(const char *RegName, LLT VT,
4914 const MachineFunction &MF) const {
4915 const Function &Fn = MF.getFunction();
4916
4917 Register Reg = StringSwitch<Register>(RegName)
4918 .Case(S: "m0", Value: AMDGPU::M0)
4919 .Case(S: "exec", Value: AMDGPU::EXEC)
4920 .Case(S: "exec_lo", Value: AMDGPU::EXEC_LO)
4921 .Case(S: "exec_hi", Value: AMDGPU::EXEC_HI)
4922 .Case(S: "flat_scratch", Value: AMDGPU::FLAT_SCR)
4923 .Case(S: "flat_scratch_lo", Value: AMDGPU::FLAT_SCR_LO)
4924 .Case(S: "flat_scratch_hi", Value: AMDGPU::FLAT_SCR_HI)
4925 .Default(Value: Register());
4926 if (!Reg)
4927 return Reg;
4928
4929 if (!Subtarget->hasFlatScrRegister() &&
4930 Subtarget->getRegisterInfo()->regsOverlap(RegA: Reg, RegB: AMDGPU::FLAT_SCR)) {
4931 Fn.getContext().emitError(ErrorStr: Twine("invalid register \"" + StringRef(RegName) +
4932 "\" for subtarget."));
4933 }
4934
4935 switch (Reg) {
4936 case AMDGPU::M0:
4937 case AMDGPU::EXEC_LO:
4938 case AMDGPU::EXEC_HI:
4939 case AMDGPU::FLAT_SCR_LO:
4940 case AMDGPU::FLAT_SCR_HI:
4941 if (VT.getSizeInBits() == 32)
4942 return Reg;
4943 break;
4944 case AMDGPU::EXEC:
4945 case AMDGPU::FLAT_SCR:
4946 if (VT.getSizeInBits() == 64)
4947 return Reg;
4948 break;
4949 default:
4950 llvm_unreachable("missing register type checking");
4951 }
4952
4953 report_fatal_error(
4954 reason: Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
4955}
4956
4957// If kill is not the last instruction, split the block so kill is always a
4958// proper terminator.
4959MachineBasicBlock *
4960SITargetLowering::splitKillBlock(MachineInstr &MI,
4961 MachineBasicBlock *BB) const {
4962 MachineBasicBlock *SplitBB = BB->splitAt(SplitInst&: MI, /*UpdateLiveIns=*/true);
4963 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4964 MI.setDesc(TII->getKillTerminatorFromPseudo(Opcode: MI.getOpcode()));
4965 return SplitBB;
4966}
4967
4968// Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
4969// \p MI will be the only instruction in the loop body block. Otherwise, it will
4970// be the first instruction in the remainder block.
4971//
4972/// \returns { LoopBody, Remainder }
4973static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4974splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
4975 MachineFunction *MF = MBB.getParent();
4976 MachineBasicBlock::iterator I(&MI);
4977
4978 // To insert the loop we need to split the block. Move everything after this
4979 // point to a new block, and insert a new empty block between the two.
4980 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
4981 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4982 MachineFunction::iterator MBBI(MBB);
4983 ++MBBI;
4984
4985 MF->insert(MBBI, MBB: LoopBB);
4986 MF->insert(MBBI, MBB: RemainderBB);
4987
4988 LoopBB->addSuccessor(Succ: LoopBB);
4989 LoopBB->addSuccessor(Succ: RemainderBB);
4990
4991 // Move the rest of the block into a new block.
4992 RemainderBB->transferSuccessorsAndUpdatePHIs(FromMBB: &MBB);
4993
4994 if (InstInLoop) {
4995 auto Next = std::next(x: I);
4996
4997 // Move instruction to loop body.
4998 LoopBB->splice(Where: LoopBB->begin(), Other: &MBB, From: I, To: Next);
4999
5000 // Move the rest of the block.
5001 RemainderBB->splice(Where: RemainderBB->begin(), Other: &MBB, From: Next, To: MBB.end());
5002 } else {
5003 RemainderBB->splice(Where: RemainderBB->begin(), Other: &MBB, From: I, To: MBB.end());
5004 }
5005
5006 MBB.addSuccessor(Succ: LoopBB);
5007
5008 return std::pair(LoopBB, RemainderBB);
5009}
5010
5011/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
5012void SITargetLowering::bundleInstWithWaitcnt(MachineInstr &MI) const {
5013 MachineBasicBlock *MBB = MI.getParent();
5014 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5015 auto I = MI.getIterator();
5016 auto E = std::next(x: I);
5017
5018 // clang-format off
5019 BuildMI(BB&: *MBB, I: E, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_WAITCNT))
5020 .addImm(Val: 0);
5021 // clang-format on
5022
5023 MIBundleBuilder Bundler(*MBB, I, E);
5024 finalizeBundle(MBB&: *MBB, FirstMI: Bundler.begin());
5025}
5026
5027MachineBasicBlock *
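// Wrap the GWS instruction in a loop that clears TRAP_STS.MEM_VIOL, executes
// the operation, and retries while the violation bit reads back as nonzero.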
5028SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
5029 MachineBasicBlock *BB) const {
5030 const DebugLoc &DL = MI.getDebugLoc();
5031
5032 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5033
5034 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5035
5036 // Apparently kill flags are only valid if the def is in the same block?
5037 if (MachineOperand *Src = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::data0))
5038 Src->setIsKill(false);
5039
5040 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB&: *BB, InstInLoop: true);
5041
5042 MachineBasicBlock::iterator I = LoopBB->end();
5043
5044 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
5045 Values: AMDGPU::Hwreg::ID_TRAPSTS, Values: AMDGPU::Hwreg::OFFSET_MEM_VIOL, Values: 1);
5046
5047 // Clear TRAP_STS.MEM_VIOL
5048 BuildMI(BB&: *LoopBB, I: LoopBB->begin(), MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_SETREG_IMM32_B32))
5049 .addImm(Val: 0)
5050 .addImm(Val: EncodedReg);
5051
5052 bundleInstWithWaitcnt(MI);
5053
5054 Register Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
5055
5056 // Load and check TRAP_STS.MEM_VIOL
5057 BuildMI(BB&: *LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_GETREG_B32), DestReg: Reg)
5058 .addImm(Val: EncodedReg);
5059
5060 // FIXME: Do we need to use an isel pseudo that may clobber scc?
5061 BuildMI(BB&: *LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_LG_U32))
5062 .addReg(RegNo: Reg, Flags: RegState::Kill)
5063 .addImm(Val: 0);
5064 // clang-format off
5065 BuildMI(BB&: *LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CBRANCH_SCC1))
5066 .addMBB(MBB: LoopBB);
5067 // clang-format on
5068
5069 return RemainderBB;
5070}
5071
5072// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
5073// wavefront. If the value is uniform and just happens to be in a VGPR, this
5074// will only do one iteration. In the worst case, this will loop 64 times.
5075//
5076// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
5077static MachineBasicBlock::iterator
5078emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
5079 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
5080 const DebugLoc &DL, const MachineOperand &Idx,
5081 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
5082 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
5083 Register &SGPRIdxReg) {
5084
5085 MachineFunction *MF = OrigBB.getParent();
5086 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5087 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5088 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
5089 MachineBasicBlock::iterator I = LoopBB.begin();
5090
5091 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5092 Register PhiExec = MRI.createVirtualRegister(RegClass: BoolRC);
5093 Register NewExec = MRI.createVirtualRegister(RegClass: BoolRC);
5094 Register CurrentIdxReg =
5095 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
5096 Register CondReg = MRI.createVirtualRegister(RegClass: BoolRC);
5097
5098 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::PHI), DestReg: PhiReg)
5099 .addReg(RegNo: InitReg)
5100 .addMBB(MBB: &OrigBB)
5101 .addReg(RegNo: ResultReg)
5102 .addMBB(MBB: &LoopBB);
5103
5104 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::PHI), DestReg: PhiExec)
5105 .addReg(RegNo: InitSaveExecReg)
5106 .addMBB(MBB: &OrigBB)
5107 .addReg(RegNo: NewExec)
5108 .addMBB(MBB: &LoopBB);
5109
5110 // Read the next variant <- also loop target.
5111 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: CurrentIdxReg)
5112 .addReg(RegNo: Idx.getReg(), Flags: getUndefRegState(B: Idx.isUndef()));
5113
5114 // Compare the just read M0 value to all possible Idx values.
5115 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_CMP_EQ_U32_e64), DestReg: CondReg)
5116 .addReg(RegNo: CurrentIdxReg)
5117 .addReg(RegNo: Idx.getReg(), Flags: {}, SubReg: Idx.getSubReg());
5118
5119 // Update EXEC, save the original EXEC value to VCC.
5120 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: LMC.AndSaveExecOpc), DestReg: NewExec)
5121 .addReg(RegNo: CondReg, Flags: RegState::Kill);
5122
5123 MRI.setSimpleHint(VReg: NewExec, PrefReg: CondReg);
5124
5125 if (UseGPRIdxMode) {
5126 if (Offset == 0) {
5127 SGPRIdxReg = CurrentIdxReg;
5128 } else {
5129 SGPRIdxReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SGPR_32RegClass);
5130 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: SGPRIdxReg)
5131 .addReg(RegNo: CurrentIdxReg, Flags: RegState::Kill)
5132 .addImm(Val: Offset);
5133 }
5134 } else {
5135 // Move index from VCC into M0
5136 if (Offset == 0) {
5137 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
5138 .addReg(RegNo: CurrentIdxReg, Flags: RegState::Kill);
5139 } else {
5140 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: AMDGPU::M0)
5141 .addReg(RegNo: CurrentIdxReg, Flags: RegState::Kill)
5142 .addImm(Val: Offset);
5143 }
5144 }
5145
5146 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
5147 MachineInstr *InsertPt =
5148 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: LMC.XorTermOpc), DestReg: LMC.ExecReg)
5149 .addReg(RegNo: LMC.ExecReg)
5150 .addReg(RegNo: NewExec);
5151
5152 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
5153 // s_cbranch_scc0?
5154
5155 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
5156 // clang-format off
5157 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CBRANCH_EXECNZ))
5158 .addMBB(MBB: &LoopBB);
5159 // clang-format on
5160
5161 return InsertPt->getIterator();
5162}
5163
5164// This has slightly sub-optimal regalloc when the source vector is killed by
5165// the read. The register allocator does not understand that the kill is
5166// per-workitem, so the source is kept alive for the whole loop and we end up
5167// not re-using a subregister from it, using 1 more VGPR than necessary. This
5168// was avoided when this was expanded after register allocation.
5169static MachineBasicBlock::iterator
5170loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI,
5171 unsigned InitResultReg, unsigned PhiReg, int Offset,
5172 bool UseGPRIdxMode, Register &SGPRIdxReg) {
5173 MachineFunction *MF = MBB.getParent();
5174 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5175 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5176 MachineRegisterInfo &MRI = MF->getRegInfo();
5177 const DebugLoc &DL = MI.getDebugLoc();
5178 MachineBasicBlock::iterator I(&MI);
5179
5180 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
5181 Register DstReg = MI.getOperand(i: 0).getReg();
5182 Register SaveExec = MRI.createVirtualRegister(RegClass: BoolXExecRC);
5183 Register TmpExec = MRI.createVirtualRegister(RegClass: BoolXExecRC);
5184 const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
5185
5186 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::IMPLICIT_DEF), DestReg: TmpExec);
5187
5188 // Save the EXEC mask
5189 // clang-format off
5190 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: LMC.MovOpc), DestReg: SaveExec)
5191 .addReg(RegNo: LMC.ExecReg);
5192 // clang-format on
5193
5194 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, InstInLoop: false);
5195
5196 const MachineOperand *Idx = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::idx);
5197
5198 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, OrigBB&: MBB, LoopBB&: *LoopBB, DL, Idx: *Idx,
5199 InitReg: InitResultReg, ResultReg: DstReg, PhiReg, InitSaveExecReg: TmpExec,
5200 Offset, UseGPRIdxMode, SGPRIdxReg);
5201
5202 MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
5203 MachineFunction::iterator MBBI(LoopBB);
5204 ++MBBI;
5205 MF->insert(MBBI, MBB: LandingPad);
5206 LoopBB->removeSuccessor(Succ: RemainderBB);
5207 LandingPad->addSuccessor(Succ: RemainderBB);
5208 LoopBB->addSuccessor(Succ: LandingPad);
5209 MachineBasicBlock::iterator First = LandingPad->begin();
5210 // clang-format off
5211 BuildMI(BB&: *LandingPad, I: First, MIMD: DL, MCID: TII->get(Opcode: LMC.MovOpc), DestReg: LMC.ExecReg)
5212 .addReg(RegNo: SaveExec);
5213 // clang-format on
5214
5215 return InsPt;
5216}
5217
5218// Returns subreg index, offset
5219static std::pair<unsigned, int>
5220computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
5221 const TargetRegisterClass *SuperRC, unsigned VecReg,
5222 int Offset) {
5223 int NumElts = TRI.getRegSizeInBits(RC: *SuperRC) / 32;
5224
5225 // Skip out of bounds offsets, or else we would end up using an undefined
5226 // register.
5227 if (Offset >= NumElts || Offset < 0)
5228 return std::pair(AMDGPU::sub0, Offset);
5229
5230 return std::pair(SIRegisterInfo::getSubRegFromChannel(Channel: Offset), 0);
5231}
5232
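// Copy the SGPR index operand of \p MI, plus an optional constant offset, into
// M0 for use by v_movrel*.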
5233static void setM0ToIndexFromSGPR(const SIInstrInfo *TII,
5234 MachineRegisterInfo &MRI, MachineInstr &MI,
5235 int Offset) {
5236 MachineBasicBlock *MBB = MI.getParent();
5237 const DebugLoc &DL = MI.getDebugLoc();
5238 MachineBasicBlock::iterator I(&MI);
5239
5240 const MachineOperand *Idx = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::idx);
5241
5242 assert(Idx->getReg() != AMDGPU::NoRegister);
5243
5244 if (Offset == 0) {
5245 // clang-format off
5246 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
5247 .add(MO: *Idx);
5248 // clang-format on
5249 } else {
5250 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: AMDGPU::M0)
5251 .add(MO: *Idx)
5252 .addImm(Val: Offset);
5253 }
5254}
5255
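// Return an SGPR holding the index plus the constant offset; the index
// register is used directly when the offset is zero.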
5256static Register getIndirectSGPRIdx(const SIInstrInfo *TII,
5257 MachineRegisterInfo &MRI, MachineInstr &MI,
5258 int Offset) {
5259 MachineBasicBlock *MBB = MI.getParent();
5260 const DebugLoc &DL = MI.getDebugLoc();
5261 MachineBasicBlock::iterator I(&MI);
5262
5263 const MachineOperand *Idx = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::idx);
5264
5265 if (Offset == 0)
5266 return Idx->getReg();
5267
5268 Register Tmp = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
5269 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: Tmp)
5270 .add(MO: *Idx)
5271 .addImm(Val: Offset);
5272 return Tmp;
5273}
5274
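// Expand an indirect vector element read. A uniform (SGPR) index is handled
// inline; a divergent (VGPR) index requires a waterfall loop over the distinct
// index values.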
5275static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
5276 MachineBasicBlock &MBB,
5277 const GCNSubtarget &ST) {
5278 const SIInstrInfo *TII = ST.getInstrInfo();
5279 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5280 MachineFunction *MF = MBB.getParent();
5281 MachineRegisterInfo &MRI = MF->getRegInfo();
5282
5283 Register Dst = MI.getOperand(i: 0).getReg();
5284 const MachineOperand *Idx = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::idx);
5285 Register SrcReg = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src)->getReg();
5286 int Offset = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::offset)->getImm();
5287
5288 const TargetRegisterClass *VecRC = MRI.getRegClass(Reg: SrcReg);
5289 const TargetRegisterClass *IdxRC = MRI.getRegClass(Reg: Idx->getReg());
5290
5291 unsigned SubReg;
5292 std::tie(args&: SubReg, args&: Offset) =
5293 computeIndirectRegAndOffset(TRI, SuperRC: VecRC, VecReg: SrcReg, Offset);
5294
5295 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5296
5297 // Check for a SGPR index.
5298 if (TII->getRegisterInfo().isSGPRClass(RC: IdxRC)) {
5299 MachineBasicBlock::iterator I(&MI);
5300 const DebugLoc &DL = MI.getDebugLoc();
5301
5302 if (UseGPRIdxMode) {
5303 // TODO: Look at the uses to avoid the copy. This may require rescheduling
5304 // to avoid interfering with other uses, so probably requires a new
5305 // optimization pass.
5306 Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
5307
5308 const MCInstrDesc &GPRIDXDesc =
5309 TII->getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(RC: *VecRC), IsIndirectSrc: true);
5310 BuildMI(BB&: MBB, I, MIMD: DL, MCID: GPRIDXDesc, DestReg: Dst)
5311 .addReg(RegNo: SrcReg)
5312 .addReg(RegNo: Idx)
5313 .addImm(Val: SubReg);
5314 } else {
5315 setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
5316
5317 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MOVRELS_B32_e32), DestReg: Dst)
5318 .addReg(RegNo: SrcReg, Flags: {}, SubReg)
5319 .addReg(RegNo: SrcReg, Flags: RegState::Implicit);
5320 }
5321
5322 MI.eraseFromParent();
5323
5324 return &MBB;
5325 }
5326
5327 // Control flow needs to be inserted if indexing with a VGPR.
5328 const DebugLoc &DL = MI.getDebugLoc();
5329 MachineBasicBlock::iterator I(&MI);
5330
5331 Register PhiReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
5332 Register InitReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
5333
5334 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::IMPLICIT_DEF), DestReg: InitReg);
5335
5336 Register SGPRIdxReg;
5337 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitResultReg: InitReg, PhiReg, Offset,
5338 UseGPRIdxMode, SGPRIdxReg);
5339
5340 MachineBasicBlock *LoopBB = InsPt->getParent();
5341
5342 if (UseGPRIdxMode) {
5343 const MCInstrDesc &GPRIDXDesc =
5344 TII->getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(RC: *VecRC), IsIndirectSrc: true);
5345
5346 BuildMI(BB&: *LoopBB, I: InsPt, MIMD: DL, MCID: GPRIDXDesc, DestReg: Dst)
5347 .addReg(RegNo: SrcReg)
5348 .addReg(RegNo: SGPRIdxReg)
5349 .addImm(Val: SubReg);
5350 } else {
5351 BuildMI(BB&: *LoopBB, I: InsPt, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MOVRELS_B32_e32), DestReg: Dst)
5352 .addReg(RegNo: SrcReg, Flags: {}, SubReg)
5353 .addReg(RegNo: SrcReg, Flags: RegState::Implicit);
5354 }
5355
5356 MI.eraseFromParent();
5357
5358 return LoopBB;
5359}
5360
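// Expand an indirect vector element write. As with the read case, an SGPR
// index is handled inline and a VGPR index requires a waterfall loop.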
5361static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
5362 MachineBasicBlock &MBB,
5363 const GCNSubtarget &ST) {
5364 const SIInstrInfo *TII = ST.getInstrInfo();
5365 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5366 MachineFunction *MF = MBB.getParent();
5367 MachineRegisterInfo &MRI = MF->getRegInfo();
5368
5369 Register Dst = MI.getOperand(i: 0).getReg();
5370 const MachineOperand *SrcVec = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src);
5371 const MachineOperand *Idx = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::idx);
5372 const MachineOperand *Val = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::val);
5373 int Offset = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::offset)->getImm();
5374 const TargetRegisterClass *VecRC = MRI.getRegClass(Reg: SrcVec->getReg());
5375 const TargetRegisterClass *IdxRC = MRI.getRegClass(Reg: Idx->getReg());
5376
5377 // This can be an immediate, but will be folded later.
5378 assert(Val->getReg());
5379
5380 unsigned SubReg;
5381 std::tie(args&: SubReg, args&: Offset) =
5382 computeIndirectRegAndOffset(TRI, SuperRC: VecRC, VecReg: SrcVec->getReg(), Offset);
5383 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5384
5385 if (Idx->getReg() == AMDGPU::NoRegister) {
5386 MachineBasicBlock::iterator I(&MI);
5387 const DebugLoc &DL = MI.getDebugLoc();
5388
5389 assert(Offset == 0);
5390
5391 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::INSERT_SUBREG), DestReg: Dst)
5392 .add(MO: *SrcVec)
5393 .add(MO: *Val)
5394 .addImm(Val: SubReg);
5395
5396 MI.eraseFromParent();
5397 return &MBB;
5398 }
5399
5400 // Check for a SGPR index.
5401 if (TII->getRegisterInfo().isSGPRClass(RC: IdxRC)) {
5402 MachineBasicBlock::iterator I(&MI);
5403 const DebugLoc &DL = MI.getDebugLoc();
5404
5405 if (UseGPRIdxMode) {
5406 Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
5407
5408 const MCInstrDesc &GPRIDXDesc =
5409 TII->getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(RC: *VecRC), IsIndirectSrc: false);
5410 BuildMI(BB&: MBB, I, MIMD: DL, MCID: GPRIDXDesc, DestReg: Dst)
5411 .addReg(RegNo: SrcVec->getReg())
5412 .add(MO: *Val)
5413 .addReg(RegNo: Idx)
5414 .addImm(Val: SubReg);
5415 } else {
5416 setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
5417
5418 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5419 VecSize: TRI.getRegSizeInBits(RC: *VecRC), EltSize: 32, IsSGPR: false);
5420 BuildMI(BB&: MBB, I, MIMD: DL, MCID: MovRelDesc, DestReg: Dst)
5421 .addReg(RegNo: SrcVec->getReg())
5422 .add(MO: *Val)
5423 .addImm(Val: SubReg);
5424 }
5425 MI.eraseFromParent();
5426 return &MBB;
5427 }
5428
5429 // Control flow needs to be inserted if indexing with a VGPR.
5430 if (Val->isReg())
5431 MRI.clearKillFlags(Reg: Val->getReg());
5432
5433 const DebugLoc &DL = MI.getDebugLoc();
5434
5435 Register PhiReg = MRI.createVirtualRegister(RegClass: VecRC);
5436
5437 Register SGPRIdxReg;
5438 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitResultReg: SrcVec->getReg(), PhiReg, Offset,
5439 UseGPRIdxMode, SGPRIdxReg);
5440 MachineBasicBlock *LoopBB = InsPt->getParent();
5441
5442 if (UseGPRIdxMode) {
5443 const MCInstrDesc &GPRIDXDesc =
5444 TII->getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(RC: *VecRC), IsIndirectSrc: false);
5445
5446 BuildMI(BB&: *LoopBB, I: InsPt, MIMD: DL, MCID: GPRIDXDesc, DestReg: Dst)
5447 .addReg(RegNo: PhiReg)
5448 .add(MO: *Val)
5449 .addReg(RegNo: SGPRIdxReg)
5450 .addImm(Val: SubReg);
5451 } else {
5452 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5453 VecSize: TRI.getRegSizeInBits(RC: *VecRC), EltSize: 32, IsSGPR: false);
5454 BuildMI(BB&: *LoopBB, I: InsPt, MIMD: DL, MCID: MovRelDesc, DestReg: Dst)
5455 .addReg(RegNo: PhiReg)
5456 .add(MO: *Val)
5457 .addImm(Val: SubReg);
5458 }
5459
5460 MI.eraseFromParent();
5461 return LoopBB;
5462}
5463
5464static MachineBasicBlock *Expand64BitScalarArithmetic(MachineInstr &MI,
5465 MachineBasicBlock *BB) {
5466 // On targets with scalar 64-bit add/sub (GFX12+), emit s_add_u64/s_sub_u64
5467 // directly; on older targets, expand into a pair of 32-bit operations.
5468 MachineFunction *MF = BB->getParent();
5469 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5470 const SIInstrInfo *TII = ST.getInstrInfo();
5471 MachineRegisterInfo &MRI = MF->getRegInfo();
5472 const DebugLoc &DL = MI.getDebugLoc();
5473 MachineOperand &Dest = MI.getOperand(i: 0);
5474 MachineOperand &Src0 = MI.getOperand(i: 1);
5475 MachineOperand &Src1 = MI.getOperand(i: 2);
5476 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5477 if (ST.hasScalarAddSub64()) {
5478 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5479 // clang-format off
5480 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: Dest.getReg())
5481 .add(MO: Src0)
5482 .add(MO: Src1);
5483 // clang-format on
5484 } else {
5485 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5486 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5487
5488 Register DestSub0 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5489 Register DestSub1 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5490
5491 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5492 MI, MRI, SuperReg: Src0, SuperRC: BoolRC, SubIdx: AMDGPU::sub0, SubRC: &AMDGPU::SReg_32RegClass);
5493 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5494 MI, MRI, SuperReg: Src0, SuperRC: BoolRC, SubIdx: AMDGPU::sub1, SubRC: &AMDGPU::SReg_32RegClass);
5495
5496 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5497 MI, MRI, SuperReg: Src1, SuperRC: BoolRC, SubIdx: AMDGPU::sub0, SubRC: &AMDGPU::SReg_32RegClass);
5498 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5499 MI, MRI, SuperReg: Src1, SuperRC: BoolRC, SubIdx: AMDGPU::sub1, SubRC: &AMDGPU::SReg_32RegClass);
5500
5501 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5502 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5503 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: LoOpc), DestReg: DestSub0).add(MO: Src0Sub0).add(MO: Src1Sub0);
5504 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: HiOpc), DestReg: DestSub1).add(MO: Src0Sub1).add(MO: Src1Sub1);
5505 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: Dest.getReg())
5506 .addReg(RegNo: DestSub0)
5507 .addImm(Val: AMDGPU::sub0)
5508 .addReg(RegNo: DestSub1)
5509 .addImm(Val: AMDGPU::sub1);
5510 }
5511 MI.eraseFromParent();
5512 return BB;
5513}
5514
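// The identity value of a reduction leaves the accumulator unchanged: e.g.
// UINT32_MAX for unsigned min, 0 for add/or/xor, all-ones for and, and a
// quiet NaN for fmin/fmax (minnum/maxnum ignore a NaN operand).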
5515static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
5516 switch (Opc) {
5517 case AMDGPU::S_MIN_U32:
5518 return std::numeric_limits<uint32_t>::max();
5519 case AMDGPU::S_MIN_I32:
5520 return std::numeric_limits<int32_t>::max();
5521 case AMDGPU::S_MAX_U32:
5522 return std::numeric_limits<uint32_t>::min();
5523 case AMDGPU::S_MAX_I32:
5524 return std::numeric_limits<int32_t>::min();
5525 case AMDGPU::V_ADD_F32_e64: // -0.0
5526 return 0x80000000;
5527 case AMDGPU::V_SUB_F32_e64: // +0.0
5528 return 0x0;
5529 case AMDGPU::S_ADD_I32:
5530 case AMDGPU::S_SUB_I32:
5531 case AMDGPU::S_OR_B32:
5532 case AMDGPU::S_XOR_B32:
5533 return std::numeric_limits<uint32_t>::min();
5534 case AMDGPU::S_AND_B32:
5535 return std::numeric_limits<uint32_t>::max();
5536 case AMDGPU::V_MIN_F32_e64:
5537 case AMDGPU::V_MAX_F32_e64:
5538 return 0x7fc00000; // qNaN
5539 default:
5540 llvm_unreachable(
5541 "Unexpected opcode in getIdentityValueFor32BitWaveReduction");
5542 }
5543}
5544
5545static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc) {
5546 switch (Opc) {
5547 case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
5548 return std::numeric_limits<uint64_t>::max();
5549 case AMDGPU::V_CMP_LT_I64_e64: // min.i64
5550 return std::numeric_limits<int64_t>::max();
5551 case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
5552 return std::numeric_limits<uint64_t>::min();
5553 case AMDGPU::V_CMP_GT_I64_e64: // max.i64
5554 return std::numeric_limits<int64_t>::min();
5555 case AMDGPU::V_MIN_F64_e64:
5556 case AMDGPU::V_MAX_F64_e64:
5557 case AMDGPU::V_MIN_NUM_F64_e64:
5558 case AMDGPU::V_MAX_NUM_F64_e64:
5559 return 0x7FF8000000000000; // qNaN
5560 case AMDGPU::S_ADD_U64_PSEUDO:
5561 case AMDGPU::S_SUB_U64_PSEUDO:
5562 case AMDGPU::S_OR_B64:
5563 case AMDGPU::S_XOR_B64:
5564 return std::numeric_limits<uint64_t>::min();
5565 case AMDGPU::S_AND_B64:
5566 return std::numeric_limits<uint64_t>::max();
5567 case AMDGPU::V_ADD_F64_e64:
5568 case AMDGPU::V_ADD_F64_pseudo_e64:
5569 return 0x8000000000000000; // -0.0
5570 default:
5571 llvm_unreachable(
5572 "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
5573 }
5574}
5575
5576static bool is32bitWaveReduceOperation(unsigned Opc) {
5577 return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
5578 Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
5579 Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
5580 Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
5581 Opc == AMDGPU::S_XOR_B32 || Opc == AMDGPU::V_MIN_F32_e64 ||
5582 Opc == AMDGPU::V_MAX_F32_e64 || Opc == AMDGPU::V_ADD_F32_e64 ||
5583 Opc == AMDGPU::V_SUB_F32_e64;
5584}
5585
5586static bool isFloatingPointWaveReduceOperation(unsigned Opc) {
5587 return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64 ||
5588 Opc == AMDGPU::V_ADD_F32_e64 || Opc == AMDGPU::V_SUB_F32_e64 ||
5589 Opc == AMDGPU::V_MIN_F64_e64 || Opc == AMDGPU::V_MAX_F64_e64 ||
5590 Opc == AMDGPU::V_MIN_NUM_F64_e64 || Opc == AMDGPU::V_MAX_NUM_F64_e64 ||
5591 Opc == AMDGPU::V_ADD_F64_e64 || Opc == AMDGPU::V_ADD_F64_pseudo_e64;
5592}
5593
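// Lower a WAVE_REDUCE_* pseudo. A uniform (SGPR) source lets the reduction
// be computed directly from the value and the active-lane count; a divergent
// (VGPR) source is reduced with a loop over the active lanes.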
5594static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
5595 MachineBasicBlock &BB,
5596 const GCNSubtarget &ST,
5597 unsigned Opc) {
5598 MachineRegisterInfo &MRI = BB.getParent()->getRegInfo();
5599 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5600 const DebugLoc &DL = MI.getDebugLoc();
5601 const SIInstrInfo *TII = ST.getInstrInfo();
5602
5603 // The reduction lowering depends on whether the input operand is an SGPR or a VGPR.
5604 Register SrcReg = MI.getOperand(i: 1).getReg();
5605 bool isSGPR = TRI->isSGPRClass(RC: MRI.getRegClass(Reg: SrcReg));
5606 Register DstReg = MI.getOperand(i: 0).getReg();
5607 MachineBasicBlock *RetBB = nullptr;
5608 if (isSGPR) {
5609 switch (Opc) {
5610 case AMDGPU::S_MIN_U32:
5611 case AMDGPU::S_MIN_I32:
5612 case AMDGPU::V_MIN_F32_e64:
5613 case AMDGPU::S_MAX_U32:
5614 case AMDGPU::S_MAX_I32:
5615 case AMDGPU::V_MAX_F32_e64:
5616 case AMDGPU::S_AND_B32:
5617 case AMDGPU::S_OR_B32: {
5618 // Idempotent operations: reducing a uniform value just returns the value.
5619 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: DstReg).addReg(RegNo: SrcReg);
5620 RetBB = &BB;
5621 break;
5622 }
5623 case AMDGPU::V_CMP_LT_U64_e64: // umin
5624 case AMDGPU::V_CMP_LT_I64_e64: // min
5625 case AMDGPU::V_CMP_GT_U64_e64: // umax
5626 case AMDGPU::V_CMP_GT_I64_e64: // max
5627 case AMDGPU::V_MIN_F64_e64:
5628 case AMDGPU::V_MIN_NUM_F64_e64:
5629 case AMDGPU::V_MAX_F64_e64:
5630 case AMDGPU::V_MAX_NUM_F64_e64:
5631 case AMDGPU::S_AND_B64:
5632 case AMDGPU::S_OR_B64: {
5633 // Idempotent operations: reducing a uniform value just returns the value.
5634 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B64), DestReg: DstReg).addReg(RegNo: SrcReg);
5635 RetBB = &BB;
5636 break;
5637 }
5638 case AMDGPU::S_XOR_B32:
5639 case AMDGPU::S_XOR_B64:
5640 case AMDGPU::S_ADD_I32:
5641 case AMDGPU::S_ADD_U64_PSEUDO:
5642 case AMDGPU::V_ADD_F32_e64:
5643 case AMDGPU::V_ADD_F64_e64:
5644 case AMDGPU::V_ADD_F64_pseudo_e64:
5645 case AMDGPU::S_SUB_I32:
5646 case AMDGPU::S_SUB_U64_PSEUDO:
5647 case AMDGPU::V_SUB_F32_e64: {
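// For a uniform input, add/sub-style reductions collapse to multiplying the
// input by the number of active lanes (the popcount of EXEC); the sign and
// the XOR parity case are handled per opcode below.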
5648 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5649 const TargetRegisterClass *DstRegClass = MRI.getRegClass(Reg: DstReg);
5650 Register ExecMask = MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
5651 Register NumActiveLanes =
5652 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5653
5654 bool IsWave32 = ST.isWave32();
5655 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5656 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5657 unsigned BitCountOpc =
5658 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5659
5660 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: MovOpc), DestReg: ExecMask).addReg(RegNo: ExecReg);
5661
5662 auto NewAccumulator =
5663 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: BitCountOpc), DestReg: NumActiveLanes)
5664 .addReg(RegNo: ExecMask);
5665
5666 switch (Opc) {
5667 case AMDGPU::S_XOR_B32:
5668 case AMDGPU::S_XOR_B64: {
5669 // XOR-reducing a uniform value depends only on the parity of the number
5670 // of active lanes: with an even count the terms cancel and the result is
5671 // 0; with an odd count the result equals the input value (e.g.
5672 // x ^ x ^ x == x).
5673 Register ParityRegister =
5674 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5675
5676 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_AND_B32), DestReg: ParityRegister)
5677 .addReg(RegNo: NewAccumulator->getOperand(i: 0).getReg())
5678 .addImm(Val: 1)
5679 .setOperandDead(3); // Dead scc
5680 if (Opc == AMDGPU::S_XOR_B32) {
5681 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: DstReg)
5682 .addReg(RegNo: SrcReg)
5683 .addReg(RegNo: ParityRegister);
5684 } else {
5685 Register DestSub0 =
5686 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5687 Register DestSub1 =
5688 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5689
5690 const TargetRegisterClass *SrcRC = MRI.getRegClass(Reg: SrcReg);
5691 const TargetRegisterClass *SrcSubRC =
5692 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5693
5694 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5695 MI, MRI, SuperReg: MI.getOperand(i: 1), SuperRC: SrcRC, SubIdx: AMDGPU::sub0, SubRC: SrcSubRC);
5696 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5697 MI, MRI, SuperReg: MI.getOperand(i: 1), SuperRC: SrcRC, SubIdx: AMDGPU::sub1, SubRC: SrcSubRC);
5698
5699 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: DestSub0)
5700 .add(MO: Op1L)
5701 .addReg(RegNo: ParityRegister);
5702
5703 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: DestSub1)
5704 .add(MO: Op1H)
5705 .addReg(RegNo: ParityRegister);
5706
5707 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: DstReg)
5708 .addReg(RegNo: DestSub0)
5709 .addImm(Val: AMDGPU::sub0)
5710 .addReg(RegNo: DestSub1)
5711 .addImm(Val: AMDGPU::sub1);
5712 }
5713 break;
5714 }
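// A uniform SUB reduction is (-input) * popcount(EXEC).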
5715 case AMDGPU::S_SUB_I32: {
5716 Register NegatedVal = MRI.createVirtualRegister(RegClass: DstRegClass);
5717
5718 // Take the negation of the source operand.
5719 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_SUB_I32), DestReg: NegatedVal)
5720 .addImm(Val: 0)
5721 .addReg(RegNo: SrcReg);
5722 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: DstReg)
5723 .addReg(RegNo: NegatedVal)
5724 .addReg(RegNo: NewAccumulator->getOperand(i: 0).getReg());
5725 break;
5726 }
5727 case AMDGPU::S_ADD_I32: {
5728 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: DstReg)
5729 .addReg(RegNo: SrcReg)
5730 .addReg(RegNo: NewAccumulator->getOperand(i: 0).getReg());
5731 break;
5732 }
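// 64-bit uniform ADD/SUB reductions multiply the 64-bit input by the
// (possibly negated) active-lane count using 32-bit multiplies and a
// high-half carry, then reassemble the result with a REG_SEQUENCE.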
5733 case AMDGPU::S_ADD_U64_PSEUDO:
5734 case AMDGPU::S_SUB_U64_PSEUDO: {
5735 Register DestSub0 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5736 Register DestSub1 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5737 Register Op1H_Op0L_Reg =
5738 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5739 Register Op1L_Op0H_Reg =
5740 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5741 Register CarryReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5742 Register AddReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5743 Register NegatedValLo =
5744 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5745 Register NegatedValHi =
5746 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5747
5748 const TargetRegisterClass *Src1RC = MRI.getRegClass(Reg: SrcReg);
5749 const TargetRegisterClass *Src1SubRC =
5750 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
5751
5752 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5753 MI, MRI, SuperReg: MI.getOperand(i: 1), SuperRC: Src1RC, SubIdx: AMDGPU::sub0, SubRC: Src1SubRC);
5754 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5755 MI, MRI, SuperReg: MI.getOperand(i: 1), SuperRC: Src1RC, SubIdx: AMDGPU::sub1, SubRC: Src1SubRC);
5756
5757 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5758 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_SUB_I32), DestReg: NegatedValLo)
5759 .addImm(Val: 0)
5760 .addReg(RegNo: NewAccumulator->getOperand(i: 0).getReg())
5761 .setOperandDead(3); // Dead scc
5762 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ASHR_I32), DestReg: NegatedValHi)
5763 .addReg(RegNo: NegatedValLo)
5764 .addImm(Val: 31)
5765 .setOperandDead(3); // Dead scc
5766 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: Op1L_Op0H_Reg)
5767 .add(MO: Op1L)
5768 .addReg(RegNo: NegatedValHi);
5769 }
5770 Register FactorLo = Opc == AMDGPU::S_SUB_U64_PSEUDO
5771 ? NegatedValLo
5772 : NewAccumulator->getOperand(i: 0).getReg();
5773 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: DestSub0)
5774 .add(MO: Op1L)
5775 .addReg(RegNo: FactorLo);
5776 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_HI_U32), DestReg: CarryReg)
5777 .add(MO: Op1L)
5778 .addReg(RegNo: FactorLo);
5779 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: Op1H_Op0L_Reg)
5780 .add(MO: Op1H)
5781 .addReg(RegNo: FactorLo);
5782
5783 Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
5784 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_U32), DestReg: HiVal)
5785 .addReg(RegNo: CarryReg)
5786 .addReg(RegNo: Op1H_Op0L_Reg)
5787 .setOperandDead(3); // Dead scc
5788
5789 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5790 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_U32), DestReg: DestSub1)
5791 .addReg(RegNo: HiVal)
5792 .addReg(RegNo: Op1L_Op0H_Reg)
5793 .setOperandDead(3); // Dead scc
5794 }
5795 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: DstReg)
5796 .addReg(RegNo: DestSub0)
5797 .addImm(Val: AMDGPU::sub0)
5798 .addReg(RegNo: DestSub1)
5799 .addImm(Val: AMDGPU::sub1);
5800 break;
5801 }
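// FP uniform reductions convert the active-lane count to floating point,
// multiply it with the (possibly negated) input, and move the product back
// into SGPRs with readfirstlane.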
5802 case AMDGPU::V_ADD_F32_e64:
5803 case AMDGPU::V_ADD_F64_e64:
5804 case AMDGPU::V_ADD_F64_pseudo_e64:
5805 case AMDGPU::V_SUB_F32_e64: {
5806 bool is32BitOpc = is32bitWaveReduceOperation(Opc);
5807 const TargetRegisterClass *VregRC = TII->getRegClass(MCID: TII->get(Opcode: Opc), OpNum: 0);
5808 Register ActiveLanesVreg = MRI.createVirtualRegister(RegClass: VregRC);
5809 Register DstVreg = MRI.createVirtualRegister(RegClass: VregRC);
5810 // Convert the number of active lanes to a floating-point value.
5811 BuildMI(BB, I&: MI, MIMD: DL,
5812 MCID: TII->get(Opcode: is32BitOpc ? AMDGPU::V_CVT_F32_I32_e64
5813 : AMDGPU::V_CVT_F64_I32_e64),
5814 DestReg: ActiveLanesVreg)
5815 .addReg(RegNo: NewAccumulator->getOperand(i: 0).getReg())
5816 .addImm(Val: 0) // clamp
5817 .addImm(Val: 0); // output-modifier
5818
5819 // Negate the input for SUB reductions.
5820 unsigned srcMod =
5821 (Opc == AMDGPU::V_SUB_F32_e64 ||
5822 MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64)
5823 ? SISrcMods::NEG
5824 : SISrcMods::NONE;
5825 unsigned MulOpc = is32BitOpc ? AMDGPU::V_MUL_F32_e64
5826 : ST.getGeneration() >= AMDGPUSubtarget::GFX12
5827 ? AMDGPU::V_MUL_F64_pseudo_e64
5828 : AMDGPU::V_MUL_F64_e64;
5829 auto DestVregInst = BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: MulOpc),
5830 DestReg: DstVreg)
5831 .addImm(Val: srcMod) // src0 modifier
5832 .addReg(RegNo: SrcReg)
5833 .addImm(Val: SISrcMods::NONE) // src1 modifier
5834 .addReg(RegNo: ActiveLanesVreg)
5835 .addImm(Val: SISrcMods::NONE) // clamp
5836 .addImm(Val: SISrcMods::NONE); // output-mod
5837 if (is32BitOpc) {
5838 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: DstReg)
5839 .addReg(RegNo: DstVreg);
5840 } else {
5841 Register LaneValueLoReg =
5842 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
5843 Register LaneValueHiReg =
5844 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
5845 const TargetRegisterClass *VregSubRC =
5846 TRI->getSubRegisterClass(VregRC, AMDGPU::sub0);
5847 MachineOperand Op1L =
5848 TII->buildExtractSubRegOrImm(MI, MRI, SuperReg: DestVregInst->getOperand(i: 0),
5849 SuperRC: VregRC, SubIdx: AMDGPU::sub0, SubRC: VregSubRC);
5850 MachineOperand Op1H =
5851 TII->buildExtractSubRegOrImm(MI, MRI, SuperReg: DestVregInst->getOperand(i: 0),
5852 SuperRC: VregRC, SubIdx: AMDGPU::sub1, SubRC: VregSubRC);
5853 // The lane value input should be in an SGPR.
5854 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32),
5855 DestReg: LaneValueLoReg)
5856 .add(MO: Op1L);
5857 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32),
5858 DestReg: LaneValueHiReg)
5859 .add(MO: Op1H);
5860 NewAccumulator =
5861 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: DstReg)
5862 .addReg(RegNo: LaneValueLoReg)
5863 .addImm(Val: AMDGPU::sub0)
5864 .addReg(RegNo: LaneValueHiReg)
5865 .addImm(Val: AMDGPU::sub1);
5866 }
5867 }
5868 }
5869 RetBB = &BB;
5870 }
5871 }
5872 } else {
5873 // TODO: Implement the DPP strategy and switch based on the immediate
5874 // strategy operand. For now we use the iterative approach for all cases
5875 // (Default, Iterative and DPP).
5876
5877 // To reduce a VGPR with the iterative approach we must visit every
5878 // active lane. The lowering builds a ComputeLoop block that iterates
5879 // over only the active lanes, using a copy of EXEC as the induction
5880 // variable: each iteration clears the current lane's bit with bitset0
5881 // so the next iteration finds the next active lane.
5882 MachineBasicBlock::iterator I = BB.end();
5883 Register SrcReg = MI.getOperand(i: 1).getReg();
5884 bool is32BitOpc = is32bitWaveReduceOperation(Opc);
5885 bool isFPOp = isFloatingPointWaveReduceOperation(Opc);
5886
5887 // Create the control flow for the loop by splitting MI's basic block
5888 // into a loop body (ComputeLoop) and a continuation block (ComputeEnd).
5889 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, MBB&: BB, InstInLoop: true);
5890
5891 // Create virtual registers required for lowering.
5892 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5893 const TargetRegisterClass *DstRegClass = MRI.getRegClass(Reg: DstReg);
5894 Register LoopIterator = MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
5895 Register IdentityValReg = MRI.createVirtualRegister(RegClass: DstRegClass);
5896 Register AccumulatorReg = MRI.createVirtualRegister(RegClass: DstRegClass);
5897 Register ActiveBitsReg = MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
5898 Register NewActiveBitsReg = MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
5899 Register FF1Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5900 Register LaneValueReg = MRI.createVirtualRegister(RegClass: DstRegClass);
5901
5902 bool IsWave32 = ST.isWave32();
5903 unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5904 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5905
5906 // Initialize the induction variable from EXEC and the accumulator from
5907 // the identity value, then branch to the newly created ComputeLoop.
5908 BuildMI(BB, I, MIMD: DL, MCID: TII->get(Opcode: MovOpcForExec), DestReg: LoopIterator).addReg(RegNo: ExecReg);
5909 if (is32BitOpc) {
5910 uint32_t IdentityValue = getIdentityValueFor32BitWaveReduction(Opc);
5911 BuildMI(BB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: IdentityValReg)
5912 .addImm(Val: IdentityValue);
5913 } else {
5914 uint64_t IdentityValue =
5915 MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
5916 ? 0x0 // +0.0 for double sub reduction
5917 : getIdentityValueFor64BitWaveReduction(Opc);
5918 BuildMI(BB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B64_IMM_PSEUDO), DestReg: IdentityValReg)
5919 .addImm(Val: IdentityValue);
5920 }
5921 // clang-format off
5922 BuildMI(BB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_BRANCH))
5923 .addMBB(MBB: ComputeLoop);
5924 // clang-format on
5925
5926 // Start constructing ComputeLoop
5927 I = ComputeLoop->begin();
5928 auto Accumulator =
5929 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::PHI), DestReg: AccumulatorReg)
5930 .addReg(RegNo: IdentityValReg)
5931 .addMBB(MBB: &BB);
5932 auto ActiveBits =
5933 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::PHI), DestReg: ActiveBitsReg)
5934 .addReg(RegNo: LoopIterator)
5935 .addMBB(MBB: &BB);
5936
5937 I = ComputeLoop->end();
5938 MachineInstr *NewAccumulator;
5939 // Perform the computations
5940 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5941 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: SFFOpc), DestReg: FF1Reg)
5942 .addReg(RegNo: ActiveBitsReg);
5943 if (is32BitOpc) {
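// 32-bit path: read the current lane's value with v_readlane and fold it
// into the accumulator, using a VALU op plus readfirstlane for FP
// reductions and a plain SALU op otherwise.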
5944 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READLANE_B32),
5945 DestReg: LaneValueReg)
5946 .addReg(RegNo: SrcReg)
5947 .addReg(RegNo: FF1Reg);
5948 if (isFPOp) {
5949 Register LaneValVreg =
5950 MRI.createVirtualRegister(RegClass: MRI.getRegClass(Reg: SrcReg));
5951 Register DstVreg = MRI.createVirtualRegister(RegClass: MRI.getRegClass(Reg: SrcReg));
5952 // Move the lane value into a VGPR to avoid the constant bus restriction.
5953 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MOV_B32_e32),
5954 DestReg: LaneValVreg)
5955 .addReg(RegNo: LaneValueReg);
5956 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: DstVreg)
5957 .addImm(Val: 0) // src0 modifier
5958 .addReg(RegNo: Accumulator->getOperand(i: 0).getReg())
5959 .addImm(Val: 0) // src1 modifier
5960 .addReg(RegNo: LaneValVreg)
5961 .addImm(Val: 0) // clamp
5962 .addImm(Val: 0); // omod
5963 NewAccumulator = BuildMI(BB&: *ComputeLoop, I, MIMD: DL,
5964 MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: DstReg)
5965 .addReg(RegNo: DstVreg);
5966 } else {
5967 NewAccumulator = BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: DstReg)
5968 .addReg(RegNo: Accumulator->getOperand(i: 0).getReg())
5969 .addReg(RegNo: LaneValueReg);
5970 }
5971 } else {
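// 64-bit path: read the lane value as two 32-bit halves, reassemble it
// with a REG_SEQUENCE, and combine it with the accumulator below.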
5972 Register LaneValueLoReg =
5973 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
5974 Register LaneValueHiReg =
5975 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
5976 Register LaneValReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_64RegClass);
5977 const TargetRegisterClass *SrcRC = MRI.getRegClass(Reg: SrcReg);
5978 const TargetRegisterClass *SrcSubRC =
5979 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5980 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5981 MI, MRI, SuperReg: MI.getOperand(i: 1), SuperRC: SrcRC, SubIdx: AMDGPU::sub0, SubRC: SrcSubRC);
5982 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5983 MI, MRI, SuperReg: MI.getOperand(i: 1), SuperRC: SrcRC, SubIdx: AMDGPU::sub1, SubRC: SrcSubRC);
5984 // The lane value input should be in an SGPR.
5985 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READLANE_B32),
5986 DestReg: LaneValueLoReg)
5987 .add(MO: Op1L)
5988 .addReg(RegNo: FF1Reg);
5989 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READLANE_B32),
5990 DestReg: LaneValueHiReg)
5991 .add(MO: Op1H)
5992 .addReg(RegNo: FF1Reg);
5993 auto LaneValue = BuildMI(BB&: *ComputeLoop, I, MIMD: DL,
5994 MCID: TII->get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: LaneValReg)
5995 .addReg(RegNo: LaneValueLoReg)
5996 .addImm(Val: AMDGPU::sub0)
5997 .addReg(RegNo: LaneValueHiReg)
5998 .addImm(Val: AMDGPU::sub1);
5999 switch (Opc) {
6000 case AMDGPU::S_OR_B64:
6001 case AMDGPU::S_AND_B64:
6002 case AMDGPU::S_XOR_B64: {
6003 NewAccumulator = BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: DstReg)
6004 .addReg(RegNo: Accumulator->getOperand(i: 0).getReg())
6005 .addReg(RegNo: LaneValue->getOperand(i: 0).getReg())
6006 .setOperandDead(3); // Dead scc
6007 break;
6008 }
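// 64-bit min/max: V_CMP the lane value against the accumulator, mask the
// result with the remaining active lanes (which also sets SCC), and pick
// the lane value or the accumulator with S_CSELECT.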
6009 case AMDGPU::V_CMP_GT_I64_e64:
6010 case AMDGPU::V_CMP_GT_U64_e64:
6011 case AMDGPU::V_CMP_LT_I64_e64:
6012 case AMDGPU::V_CMP_LT_U64_e64: {
6013 Register LaneMaskReg = MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
6014 Register ComparisonResultReg =
6015 MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
6016 int SrcIdx =
6017 AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::src);
6018 const TargetRegisterClass *VregClass =
6019 TRI->getAllocatableClass(RC: TII->getRegClass(MCID: MI.getDesc(), OpNum: SrcIdx));
6020 const TargetRegisterClass *VSubRegClass =
6021 TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
6022 Register AccumulatorVReg = MRI.createVirtualRegister(RegClass: VregClass);
6023 MachineOperand SrcReg0Sub0 =
6024 TII->buildExtractSubRegOrImm(MI, MRI, SuperReg: Accumulator->getOperand(i: 0),
6025 SuperRC: VregClass, SubIdx: AMDGPU::sub0, SubRC: VSubRegClass);
6026 MachineOperand SrcReg0Sub1 =
6027 TII->buildExtractSubRegOrImm(MI, MRI, SuperReg: Accumulator->getOperand(i: 0),
6028 SuperRC: VregClass, SubIdx: AMDGPU::sub1, SubRC: VSubRegClass);
6029 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::REG_SEQUENCE),
6030 DestReg: AccumulatorVReg)
6031 .add(MO: SrcReg0Sub0)
6032 .addImm(Val: AMDGPU::sub0)
6033 .add(MO: SrcReg0Sub1)
6034 .addImm(Val: AMDGPU::sub1);
6035 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: LaneMaskReg)
6036 .addReg(RegNo: LaneValue->getOperand(i: 0).getReg())
6037 .addReg(RegNo: AccumulatorVReg);
6038
6039 unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
6040 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AndOpc), DestReg: ComparisonResultReg)
6041 .addReg(RegNo: LaneMaskReg)
6042 .addReg(RegNo: ActiveBitsReg);
6043
6044 NewAccumulator = BuildMI(BB&: *ComputeLoop, I, MIMD: DL,
6045 MCID: TII->get(Opcode: AMDGPU::S_CSELECT_B64), DestReg: DstReg)
6046 .addReg(RegNo: LaneValue->getOperand(i: 0).getReg())
6047 .addReg(RegNo: Accumulator->getOperand(i: 0).getReg());
6048 break;
6049 }
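// 64-bit FP reductions: copy the accumulator into VGPRs, apply the VALU op
// to it and the (possibly negated) lane value, then read the result back
// into SGPRs half by half.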
6050 case AMDGPU::V_MIN_F64_e64:
6051 case AMDGPU::V_MIN_NUM_F64_e64:
6052 case AMDGPU::V_MAX_F64_e64:
6053 case AMDGPU::V_MAX_NUM_F64_e64:
6054 case AMDGPU::V_ADD_F64_e64:
6055 case AMDGPU::V_ADD_F64_pseudo_e64: {
6056 int SrcIdx =
6057 AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::src);
6058 const TargetRegisterClass *VregRC =
6059 TRI->getAllocatableClass(RC: TII->getRegClass(MCID: MI.getDesc(), OpNum: SrcIdx));
6060 const TargetRegisterClass *VregSubRC =
6061 TRI->getSubRegisterClass(VregRC, AMDGPU::sub0);
6062 Register AccumulatorVReg = MRI.createVirtualRegister(RegClass: VregRC);
6063 Register DstVreg = MRI.createVirtualRegister(RegClass: VregRC);
6064 Register LaneValLo =
6065 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6066 Register LaneValHi =
6067 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6068 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: AccumulatorVReg)
6069 .addReg(RegNo: Accumulator->getOperand(i: 0).getReg());
6070 unsigned Modifier =
6071 MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
6072 ? SISrcMods::NEG
6073 : SISrcMods::NONE;
6074 auto DstVregInst = BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: DstVreg)
6075 .addImm(Val: Modifier) // src0 modifiers
6076 .addReg(RegNo: LaneValue->getOperand(i: 0).getReg())
6077 .addImm(Val: SISrcMods::NONE) // src1 modifiers
6078 .addReg(RegNo: AccumulatorVReg)
6079 .addImm(Val: SISrcMods::NONE) // clamp
6080 .addImm(Val: SISrcMods::NONE); // omod
6081 auto ReadLaneLo =
6082 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32),
6083 DestReg: LaneValLo);
6084 auto ReadLaneHi =
6085 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32),
6086 DestReg: LaneValHi);
6087 MachineBasicBlock::iterator Iters = *ReadLaneLo;
6088 MachineOperand Op1L =
6089 TII->buildExtractSubRegOrImm(MI: Iters, MRI, SuperReg: DstVregInst->getOperand(i: 0),
6090 SuperRC: VregRC, SubIdx: AMDGPU::sub0, SubRC: VregSubRC);
6091 MachineOperand Op1H =
6092 TII->buildExtractSubRegOrImm(MI: Iters, MRI, SuperReg: DstVregInst->getOperand(i: 0),
6093 SuperRC: VregRC, SubIdx: AMDGPU::sub1, SubRC: VregSubRC);
6094 ReadLaneLo.add(MO: Op1L);
6095 ReadLaneHi.add(MO: Op1H);
6096 NewAccumulator = BuildMI(BB&: *ComputeLoop, I, MIMD: DL,
6097 MCID: TII->get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: DstReg)
6098 .addReg(RegNo: LaneValLo)
6099 .addImm(Val: AMDGPU::sub0)
6100 .addReg(RegNo: LaneValHi)
6101 .addImm(Val: AMDGPU::sub1);
6102 break;
6103 }
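// 64-bit integer ADD/SUB: emit the scalar pseudo and expand it into 32-bit
// operations immediately, inside the loop block.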
6104 case AMDGPU::S_ADD_U64_PSEUDO:
6105 case AMDGPU::S_SUB_U64_PSEUDO: {
6106 NewAccumulator = BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: DstReg)
6107 .addReg(RegNo: Accumulator->getOperand(i: 0).getReg())
6108 .addReg(RegNo: LaneValue->getOperand(i: 0).getReg());
6109 ComputeLoop = Expand64BitScalarArithmetic(MI&: *NewAccumulator, BB: ComputeLoop);
6110 break;
6111 }
6112 }
6113 }
6114 // Clear the current lane's bit in the iterator to move to the next active lane.
6115 unsigned BITSETOpc =
6116 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
6117 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: BITSETOpc), DestReg: NewActiveBitsReg)
6118 .addReg(RegNo: FF1Reg)
6119 .addReg(RegNo: ActiveBitsReg);
6120
6121 // Complete the PHI nodes with the values coming from the loop back-edge.
6122 Accumulator.addReg(RegNo: DstReg).addMBB(MBB: ComputeLoop);
6123 ActiveBits.addReg(RegNo: NewActiveBitsReg).addMBB(MBB: ComputeLoop);
6124
6125 // Emit the loop condition check and the conditional branch back to ComputeLoop.
6126 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
6127 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: CMPOpc))
6128 .addReg(RegNo: NewActiveBitsReg)
6129 .addImm(Val: 0);
6130 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CBRANCH_SCC1))
6131 .addMBB(MBB: ComputeLoop);
6132
6133 RetBB = ComputeEnd;
6134 }
6135 MI.eraseFromParent();
6136 return RetBB;
6137}
6138
6139MachineBasicBlock *
6140SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
6141 MachineBasicBlock *BB) const {
6142 MachineFunction *MF = BB->getParent();
6143 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
6144 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
6145 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
6146 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
6147 MachineRegisterInfo &MRI = MF->getRegInfo();
6148 const DebugLoc &DL = MI.getDebugLoc();
6149
6150 switch (MI.getOpcode()) {
6151 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
6152 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_MIN_U32);
6153 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
6154 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::V_CMP_LT_U64_e64);
6155 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
6156 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_MIN_I32);
6157 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
6158 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::V_CMP_LT_I64_e64);
6159 case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F32:
6160 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::V_MIN_F32_e64);
6161 case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F64:
6162 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(),
6163 Opc: ST.getGeneration() >= AMDGPUSubtarget::GFX12
6164 ? AMDGPU::V_MIN_NUM_F64_e64
6165 : AMDGPU::V_MIN_F64_e64);
6166 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
6167 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_MAX_U32);
6168 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
6169 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::V_CMP_GT_U64_e64);
6170 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
6171 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_MAX_I32);
6172 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
6173 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::V_CMP_GT_I64_e64);
6174 case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F32:
6175 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::V_MAX_F32_e64);
6176 case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F64:
6177 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(),
6178 Opc: ST.getGeneration() >= AMDGPUSubtarget::GFX12
6179 ? AMDGPU::V_MAX_NUM_F64_e64
6180 : AMDGPU::V_MAX_F64_e64);
6181 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
6182 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_ADD_I32);
6183 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
6184 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_ADD_U64_PSEUDO);
6185 case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F32:
6186 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::V_ADD_F32_e64);
6187 case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F64:
6188 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(),
6189 Opc: ST.getGeneration() >= AMDGPUSubtarget::GFX12
6190 ? AMDGPU::V_ADD_F64_pseudo_e64
6191 : AMDGPU::V_ADD_F64_e64);
6192 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
6193 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_SUB_I32);
6194 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
6195 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_SUB_U64_PSEUDO);
6196 case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32:
6197 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::V_SUB_F32_e64);
6198 case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64:
6199 // There is no S/V_SUB_F64 opcode. Double-precision subtraction is expanded
6200 // as an fadd with the NEG source modifier set on the input.
6201 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(),
6202 Opc: ST.getGeneration() >= AMDGPUSubtarget::GFX12
6203 ? AMDGPU::V_ADD_F64_pseudo_e64
6204 : AMDGPU::V_ADD_F64_e64);
6205 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
6206 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_AND_B32);
6207 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
6208 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_AND_B64);
6209 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
6210 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_OR_B32);
6211 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
6212 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_OR_B64);
6213 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
6214 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_XOR_B32);
6215 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
6216 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_XOR_B64);
6217 case AMDGPU::S_UADDO_PSEUDO:
6218 case AMDGPU::S_USUBO_PSEUDO: {
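// Scalar uaddo/usubo: the 32-bit add/sub sets SCC on carry/borrow, which is
// then materialized into the overflow result with S_CSELECT.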
6219 MachineOperand &Dest0 = MI.getOperand(i: 0);
6220 MachineOperand &Dest1 = MI.getOperand(i: 1);
6221 MachineOperand &Src0 = MI.getOperand(i: 2);
6222 MachineOperand &Src1 = MI.getOperand(i: 3);
6223
6224 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
6225 ? AMDGPU::S_ADD_U32
6226 : AMDGPU::S_SUB_U32;
6227 // clang-format off
6228 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: Dest0.getReg())
6229 .add(MO: Src0)
6230 .add(MO: Src1);
6231 // clang-format on
6232
6233 unsigned SelOpc =
6234 Subtarget->isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6235 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: SelOpc), DestReg: Dest1.getReg()).addImm(Val: -1).addImm(Val: 0);
6236
6237 MI.eraseFromParent();
6238 return BB;
6239 }
6240 case AMDGPU::S_ADD_U64_PSEUDO:
6241 case AMDGPU::S_SUB_U64_PSEUDO: {
6242 return Expand64BitScalarArithmetic(MI, BB);
6243 }
6244 case AMDGPU::V_ADD_U64_PSEUDO:
6245 case AMDGPU::V_SUB_U64_PSEUDO: {
6246 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
6247
6248 MachineOperand &Dest = MI.getOperand(i: 0);
6249 MachineOperand &Src0 = MI.getOperand(i: 1);
6250 MachineOperand &Src1 = MI.getOperand(i: 2);
6251
6252 if (ST.hasAddSubU64Insts()) {
6253 auto I = BuildMI(BB&: *BB, I&: MI, MIMD: DL,
6254 MCID: TII->get(Opcode: IsAdd ? AMDGPU::V_ADD_U64_e64
6255 : AMDGPU::V_SUB_U64_e64),
6256 DestReg: Dest.getReg())
6257 .add(MO: Src0)
6258 .add(MO: Src1)
6259 .addImm(Val: 0); // clamp
6260 TII->legalizeOperands(MI&: *I);
6261 MI.eraseFromParent();
6262 return BB;
6263 }
6264
6265 if (IsAdd && ST.hasLshlAddU64Inst()) {
6266 auto Add = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_LSHL_ADD_U64_e64),
6267 DestReg: Dest.getReg())
6268 .add(MO: Src0)
6269 .addImm(Val: 0)
6270 .add(MO: Src1);
6271 TII->legalizeOperands(MI&: *Add);
6272 MI.eraseFromParent();
6273 return BB;
6274 }
6275
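// No suitable 64-bit instruction: split into a 32-bit add/sub producing a
// carry, an add/sub consuming it, and a REG_SEQUENCE recombining the halves.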
6276 const auto *CarryRC = TRI->getWaveMaskRegClass();
6277
6278 Register DestSub0 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
6279 Register DestSub1 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
6280
6281 Register CarryReg = MRI.createVirtualRegister(RegClass: CarryRC);
6282 Register DeadCarryReg = MRI.createVirtualRegister(RegClass: CarryRC);
6283
6284 const TargetRegisterClass *Src0RC = Src0.isReg()
6285 ? MRI.getRegClass(Reg: Src0.getReg())
6286 : &AMDGPU::VReg_64RegClass;
6287 const TargetRegisterClass *Src1RC = Src1.isReg()
6288 ? MRI.getRegClass(Reg: Src1.getReg())
6289 : &AMDGPU::VReg_64RegClass;
6290
6291 const TargetRegisterClass *Src0SubRC =
6292 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6293 const TargetRegisterClass *Src1SubRC =
6294 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6295
6296 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
6297 MI, MRI, SuperReg: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub0, SubRC: Src0SubRC);
6298 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
6299 MI, MRI, SuperReg: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub0, SubRC: Src1SubRC);
6300
6301 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
6302 MI, MRI, SuperReg: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub1, SubRC: Src0SubRC);
6303 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
6304 MI, MRI, SuperReg: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub1, SubRC: Src1SubRC);
6305
6306 unsigned LoOpc =
6307 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
6308 MachineInstr *LoHalf = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: LoOpc), DestReg: DestSub0)
6309 .addReg(RegNo: CarryReg, Flags: RegState::Define)
6310 .add(MO: SrcReg0Sub0)
6311 .add(MO: SrcReg1Sub0)
6312 .addImm(Val: 0); // clamp bit
6313
6314 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
6315 MachineInstr *HiHalf =
6316 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: HiOpc), DestReg: DestSub1)
6317 .addReg(RegNo: DeadCarryReg, Flags: RegState::Define | RegState::Dead)
6318 .add(MO: SrcReg0Sub1)
6319 .add(MO: SrcReg1Sub1)
6320 .addReg(RegNo: CarryReg, Flags: RegState::Kill)
6321 .addImm(Val: 0); // clamp bit
6322
6323 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: Dest.getReg())
6324 .addReg(RegNo: DestSub0)
6325 .addImm(Val: AMDGPU::sub0)
6326 .addReg(RegNo: DestSub1)
6327 .addImm(Val: AMDGPU::sub1);
6328 TII->legalizeOperands(MI&: *LoHalf);
6329 TII->legalizeOperands(MI&: *HiHalf);
6330 MI.eraseFromParent();
6331 return BB;
6332 }
6333 case AMDGPU::S_ADD_CO_PSEUDO:
6334 case AMDGPU::S_SUB_CO_PSEUDO: {
6335 // This pseudo can only be selected from a uniform add/subcarry node, so
6336 // all of its VGPR operands are assumed to be lane-uniform and can safely
6337 // be read with readfirstlane.
6338 MachineBasicBlock::iterator MII = MI;
6339 MachineOperand &Dest = MI.getOperand(i: 0);
6340 MachineOperand &CarryDest = MI.getOperand(i: 1);
6341 MachineOperand &Src0 = MI.getOperand(i: 2);
6342 MachineOperand &Src1 = MI.getOperand(i: 3);
6343 MachineOperand &Src2 = MI.getOperand(i: 4);
6344 if (Src0.isReg() && TRI->isVectorRegister(MRI, Reg: Src0.getReg())) {
6345 Register RegOp0 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6346 BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: RegOp0)
6347 .addReg(RegNo: Src0.getReg());
6348 Src0.setReg(RegOp0);
6349 }
6350 if (Src1.isReg() && TRI->isVectorRegister(MRI, Reg: Src1.getReg())) {
6351 Register RegOp1 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6352 BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: RegOp1)
6353 .addReg(RegNo: Src1.getReg());
6354 Src1.setReg(RegOp1);
6355 }
6356 Register RegOp2 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6357 if (TRI->isVectorRegister(MRI, Reg: Src2.getReg())) {
6358 BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: RegOp2)
6359 .addReg(RegNo: Src2.getReg());
6360 Src2.setReg(RegOp2);
6361 }
6362
6363 if (ST.isWave64()) {
6364 if (ST.hasScalarCompareEq64()) {
6365 BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_LG_U64))
6366 .addReg(RegNo: Src2.getReg())
6367 .addImm(Val: 0);
6368 } else {
6369 const TargetRegisterClass *Src2RC = MRI.getRegClass(Reg: Src2.getReg());
6370 const TargetRegisterClass *SubRC =
6371 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
6372 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
6373 MI: MII, MRI, SuperReg: Src2, SuperRC: Src2RC, SubIdx: AMDGPU::sub0, SubRC);
6374 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
6375 MI: MII, MRI, SuperReg: Src2, SuperRC: Src2RC, SubIdx: AMDGPU::sub1, SubRC);
6376 Register Src2_32 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
6377
6378 BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_OR_B32), DestReg: Src2_32)
6379 .add(MO: Src2Sub0)
6380 .add(MO: Src2Sub1);
6381
6382 BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_LG_U32))
6383 .addReg(RegNo: Src2_32, Flags: RegState::Kill)
6384 .addImm(Val: 0);
6385 }
6386 } else {
6387 BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_LG_U32))
6388 .addReg(RegNo: Src2.getReg())
6389 .addImm(Val: 0);
6390 }
6391
6392 unsigned Opc = MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO
6393 ? AMDGPU::S_ADDC_U32
6394 : AMDGPU::S_SUBB_U32;
6395
6396 BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: Dest.getReg()).add(MO: Src0).add(MO: Src1);
6397
6398 unsigned SelOpc =
6399 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6400
6401 BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: SelOpc), DestReg: CarryDest.getReg())
6402 .addImm(Val: -1)
6403 .addImm(Val: 0);
6404
6405 MI.eraseFromParent();
6406 return BB;
6407 }
6408 case AMDGPU::SI_INIT_M0: {
6409 MachineOperand &M0Init = MI.getOperand(i: 0);
6410 BuildMI(BB&: *BB, I: MI.getIterator(), MIMD: MI.getDebugLoc(),
6411 MCID: TII->get(Opcode: M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
6412 DestReg: AMDGPU::M0)
6413 .add(MO: M0Init);
6414 MI.eraseFromParent();
6415 return BB;
6416 }
6417 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
6418 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
6419 BuildMI(BB&: *BB, I: MI.getIterator(), MIMD: MI.getDebugLoc(),
6420 MCID: TII->get(Opcode: AMDGPU::S_CMP_EQ_U32))
6421 .addImm(Val: 0)
6422 .addImm(Val: 0);
6423 return BB;
6424 }
6425 case AMDGPU::GET_GROUPSTATICSIZE: {
6426 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
6427 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
6428 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32))
6429 .add(MO: MI.getOperand(i: 0))
6430 .addImm(Val: MFI->getLDSSize());
6431 MI.eraseFromParent();
6432 return BB;
6433 }
6434 case AMDGPU::GET_SHADERCYCLESHILO: {
6435 assert(MF->getSubtarget<GCNSubtarget>().hasShaderCyclesHiLoRegisters());
6436 // The algorithm is:
6437 //
6438 // hi1 = getreg(SHADER_CYCLES_HI)
6439 // lo1 = getreg(SHADER_CYCLES_LO)
6440 // hi2 = getreg(SHADER_CYCLES_HI)
6441 //
6442 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
6443 // Otherwise there was overflow and the result is hi2:0. In both cases the
6444 // result should represent the actual time at some point during the sequence
6445 // of three getregs.
6446 using namespace AMDGPU::Hwreg;
6447 Register RegHi1 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
6448 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_GETREG_B32), DestReg: RegHi1)
6449 .addImm(Val: HwregEncoding::encode(Values: ID_SHADER_CYCLES_HI, Values: 0, Values: 32));
6450 Register RegLo1 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
6451 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_GETREG_B32), DestReg: RegLo1)
6452 .addImm(Val: HwregEncoding::encode(Values: ID_SHADER_CYCLES, Values: 0, Values: 32));
6453 Register RegHi2 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
6454 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_GETREG_B32), DestReg: RegHi2)
6455 .addImm(Val: HwregEncoding::encode(Values: ID_SHADER_CYCLES_HI, Values: 0, Values: 32));
6456 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_EQ_U32))
6457 .addReg(RegNo: RegHi1)
6458 .addReg(RegNo: RegHi2);
6459 Register RegLo = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
6460 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CSELECT_B32), DestReg: RegLo)
6461 .addReg(RegNo: RegLo1)
6462 .addImm(Val: 0);
6463 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::REG_SEQUENCE))
6464 .add(MO: MI.getOperand(i: 0))
6465 .addReg(RegNo: RegLo)
6466 .addImm(Val: AMDGPU::sub0)
6467 .addReg(RegNo: RegHi2)
6468 .addImm(Val: AMDGPU::sub1);
6469 MI.eraseFromParent();
6470 return BB;
6471 }
6472 case AMDGPU::SI_INDIRECT_SRC_V1:
6473 case AMDGPU::SI_INDIRECT_SRC_V2:
6474 case AMDGPU::SI_INDIRECT_SRC_V3:
6475 case AMDGPU::SI_INDIRECT_SRC_V4:
6476 case AMDGPU::SI_INDIRECT_SRC_V5:
6477 case AMDGPU::SI_INDIRECT_SRC_V6:
6478 case AMDGPU::SI_INDIRECT_SRC_V7:
6479 case AMDGPU::SI_INDIRECT_SRC_V8:
6480 case AMDGPU::SI_INDIRECT_SRC_V9:
6481 case AMDGPU::SI_INDIRECT_SRC_V10:
6482 case AMDGPU::SI_INDIRECT_SRC_V11:
6483 case AMDGPU::SI_INDIRECT_SRC_V12:
6484 case AMDGPU::SI_INDIRECT_SRC_V16:
6485 case AMDGPU::SI_INDIRECT_SRC_V32:
6486 return emitIndirectSrc(MI, MBB&: *BB, ST: *getSubtarget());
6487 case AMDGPU::SI_INDIRECT_DST_V1:
6488 case AMDGPU::SI_INDIRECT_DST_V2:
6489 case AMDGPU::SI_INDIRECT_DST_V3:
6490 case AMDGPU::SI_INDIRECT_DST_V4:
6491 case AMDGPU::SI_INDIRECT_DST_V5:
6492 case AMDGPU::SI_INDIRECT_DST_V6:
6493 case AMDGPU::SI_INDIRECT_DST_V7:
6494 case AMDGPU::SI_INDIRECT_DST_V8:
6495 case AMDGPU::SI_INDIRECT_DST_V9:
6496 case AMDGPU::SI_INDIRECT_DST_V10:
6497 case AMDGPU::SI_INDIRECT_DST_V11:
6498 case AMDGPU::SI_INDIRECT_DST_V12:
6499 case AMDGPU::SI_INDIRECT_DST_V16:
6500 case AMDGPU::SI_INDIRECT_DST_V32:
6501 return emitIndirectDst(MI, MBB&: *BB, ST: *getSubtarget());
6502 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
6503 case AMDGPU::SI_KILL_I1_PSEUDO:
6504 return splitKillBlock(MI, BB);
6505 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
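// Expand the 64-bit select into two 32-bit V_CNDMASK_B32s on the low and
// high halves, sharing one copy of the condition mask.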
6506 Register Dst = MI.getOperand(i: 0).getReg();
6507 const MachineOperand &Src0 = MI.getOperand(i: 1);
6508 const MachineOperand &Src1 = MI.getOperand(i: 2);
6509 Register SrcCond = MI.getOperand(i: 3).getReg();
6510
6511 Register DstLo = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
6512 Register DstHi = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
6513 const auto *CondRC = TRI->getWaveMaskRegClass();
6514 Register SrcCondCopy = MRI.createVirtualRegister(RegClass: CondRC);
6515
6516 const TargetRegisterClass *Src0RC = Src0.isReg()
6517 ? MRI.getRegClass(Reg: Src0.getReg())
6518 : &AMDGPU::VReg_64RegClass;
6519 const TargetRegisterClass *Src1RC = Src1.isReg()
6520 ? MRI.getRegClass(Reg: Src1.getReg())
6521 : &AMDGPU::VReg_64RegClass;
6522
6523 const TargetRegisterClass *Src0SubRC =
6524 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6525 const TargetRegisterClass *Src1SubRC =
6526 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6527
6528 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
6529 MI, MRI, SuperReg: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub0, SubRC: Src0SubRC);
6530 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
6531 MI, MRI, SuperReg: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub0, SubRC: Src1SubRC);
6532
6533 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
6534 MI, MRI, SuperReg: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub1, SubRC: Src0SubRC);
6535 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
6536 MI, MRI, SuperReg: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub1, SubRC: Src1SubRC);
6537
6538 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: SrcCondCopy).addReg(RegNo: SrcCond);
6539 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstLo)
6540 .addImm(Val: 0)
6541 .add(MO: Src0Sub0)
6542 .addImm(Val: 0)
6543 .add(MO: Src1Sub0)
6544 .addReg(RegNo: SrcCondCopy);
6545 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstHi)
6546 .addImm(Val: 0)
6547 .add(MO: Src0Sub1)
6548 .addImm(Val: 0)
6549 .add(MO: Src1Sub1)
6550 .addReg(RegNo: SrcCondCopy);
6551
6552 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: Dst)
6553 .addReg(RegNo: DstLo)
6554 .addImm(Val: AMDGPU::sub0)
6555 .addReg(RegNo: DstHi)
6556 .addImm(Val: AMDGPU::sub1);
6557 MI.eraseFromParent();
6558 return BB;
6559 }
6560 case AMDGPU::SI_BR_UNDEF: {
6561 MachineInstr *Br = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CBRANCH_SCC1))
6562 .add(MO: MI.getOperand(i: 0));
6563 Br->getOperand(i: 1).setIsUndef(); // read undef SCC
6564 MI.eraseFromParent();
6565 return BB;
6566 }
6567 case AMDGPU::ADJCALLSTACKUP:
6568 case AMDGPU::ADJCALLSTACKDOWN: {
6569 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
6570 MachineInstrBuilder MIB(*MF, &MI);
6571 MIB.addReg(RegNo: Info->getStackPtrOffsetReg(), Flags: RegState::ImplicitDefine)
6572 .addReg(RegNo: Info->getStackPtrOffsetReg(), Flags: RegState::Implicit);
6573 return BB;
6574 }
6575 case AMDGPU::SI_CALL_ISEL: {
6576 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(MF: *MF);
6577
6578 MachineInstrBuilder MIB;
6579 MIB = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::SI_CALL), DestReg: ReturnAddrReg);
6580
6581 for (const MachineOperand &MO : MI.operands())
6582 MIB.add(MO);
6583
6584 MIB.cloneMemRefs(OtherMI: MI);
6585 MI.eraseFromParent();
6586 return BB;
6587 }
6588 case AMDGPU::V_ADD_CO_U32_e32:
6589 case AMDGPU::V_SUB_CO_U32_e32:
6590 case AMDGPU::V_SUBREV_CO_U32_e32: {
6591 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
6592 unsigned Opc = MI.getOpcode();
6593
6594 bool NeedClampOperand = false;
6595 if (TII->pseudoToMCOpcode(Opcode: Opc) == -1) {
6596 Opc = AMDGPU::getVOPe64(Opcode: Opc);
6597 NeedClampOperand = true;
6598 }
6599
6600 auto I = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: MI.getOperand(i: 0).getReg());
6601 if (TII->isVOP3(MI: *I)) {
6602 I.addReg(RegNo: TRI->getVCC(), Flags: RegState::Define);
6603 }
6604 I.add(MO: MI.getOperand(i: 1)).add(MO: MI.getOperand(i: 2));
6605 if (NeedClampOperand)
6606 I.addImm(Val: 0); // clamp bit for e64 encoding
6607
6608 TII->legalizeOperands(MI&: *I);
6609
6610 MI.eraseFromParent();
6611 return BB;
6612 }
6613 case AMDGPU::V_ADDC_U32_e32:
6614 case AMDGPU::V_SUBB_U32_e32:
6615 case AMDGPU::V_SUBBREV_U32_e32:
6616 // These instructions have an implicit use of vcc which counts towards the
6617 // constant bus limit.
6618 TII->legalizeOperands(MI);
6619 return BB;
6620 case AMDGPU::DS_GWS_INIT:
6621 case AMDGPU::DS_GWS_SEMA_BR:
6622 case AMDGPU::DS_GWS_BARRIER:
6623 case AMDGPU::DS_GWS_SEMA_V:
6624 case AMDGPU::DS_GWS_SEMA_P:
6625 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
6626 // An s_waitcnt 0 is required to be the instruction immediately following.
6627 if (getSubtarget()->hasGWSAutoReplay()) {
6628 bundleInstWithWaitcnt(MI);
6629 return BB;
6630 }
6631
6632 return emitGWSMemViolTestLoop(MI, BB);
6633 case AMDGPU::S_SETREG_B32: {
6634 // Try to optimize cases that only set the denormal mode or rounding mode.
6635 //
6636 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
6637 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
6638 // instead.
6639 //
6640 // FIXME: This could be done with predicates on the immediate, but TableGen
6641 // doesn't allow a no-side-effect instruction in the output of a
6642 // side-effecting pattern.
6643 auto [ID, Offset, Width] =
6644 AMDGPU::Hwreg::HwregEncoding::decode(Encoded: MI.getOperand(i: 1).getImm());
6645 if (ID != AMDGPU::Hwreg::ID_MODE)
6646 return BB;
6647
6648 const unsigned WidthMask = maskTrailingOnes<unsigned>(N: Width);
6649 const unsigned SetMask = WidthMask << Offset;
6650
6651 if (getSubtarget()->hasDenormModeInst()) {
6652 unsigned SetDenormOp = 0;
6653 unsigned SetRoundOp = 0;
6654
6655 // The dedicated instructions can only set the whole denorm or round mode
6656 // at once, not a subset of bits in either.
6657 if (SetMask ==
6658 (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
6659 // If this fully sets both the round and denorm mode, emit the two
6660 // dedicated instructions for these.
6661 SetRoundOp = AMDGPU::S_ROUND_MODE;
6662 SetDenormOp = AMDGPU::S_DENORM_MODE;
6663 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
6664 SetRoundOp = AMDGPU::S_ROUND_MODE;
6665 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
6666 SetDenormOp = AMDGPU::S_DENORM_MODE;
6667 }
6668
6669 if (SetRoundOp || SetDenormOp) {
6670 MachineInstr *Def = MRI.getVRegDef(Reg: MI.getOperand(i: 0).getReg());
6671 if (Def && Def->isMoveImmediate() && Def->getOperand(i: 1).isImm()) {
6672 unsigned ImmVal = Def->getOperand(i: 1).getImm();
6673 if (SetRoundOp) {
6674 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: SetRoundOp))
6675 .addImm(Val: ImmVal & 0xf);
6676
6677 // If we also have the denorm mode, get just the denorm mode bits.
6678 ImmVal >>= 4;
6679 }
6680
6681 if (SetDenormOp) {
6682 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: SetDenormOp))
6683 .addImm(Val: ImmVal & 0xf);
6684 }
6685
6686 MI.eraseFromParent();
6687 return BB;
6688 }
6689 }
6690 }
6691
6692 // If only FP bits are touched, use the no-side-effects pseudo.
6693 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
6694 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
6695 MI.setDesc(TII->get(Opcode: AMDGPU::S_SETREG_B32_mode));
6696
6697 return BB;
6698 }
6699 case AMDGPU::S_INVERSE_BALLOT_U32:
6700 case AMDGPU::S_INVERSE_BALLOT_U64:
6701 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
6702 // necessary. After that they are equivalent to a COPY.
6703 MI.setDesc(TII->get(Opcode: AMDGPU::COPY));
6704 return BB;
6705 case AMDGPU::ENDPGM_TRAP: {
6706 if (BB->succ_empty() && std::next(x: MI.getIterator()) == BB->end()) {
6707 MI.setDesc(TII->get(Opcode: AMDGPU::S_ENDPGM));
6708 MI.addOperand(Op: MachineOperand::CreateImm(Val: 0));
6709 return BB;
6710 }
6711
6712 // We need a block split to make the real endpgm a terminator. We also don't
6713 // want to break phis in successor blocks, so we can't just delete to the
6714 // end of the block.
6715
6716 MachineBasicBlock *SplitBB = BB->splitAt(SplitInst&: MI, UpdateLiveIns: false /*UpdateLiveIns*/);
6717 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
6718 MF->push_back(MBB: TrapBB);
6719 // clang-format off
6720 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ENDPGM))
6721 .addImm(Val: 0);
6722 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CBRANCH_EXECNZ))
6723 .addMBB(MBB: TrapBB);
6724 // clang-format on
6725
6726 BB->addSuccessor(Succ: TrapBB);
6727 MI.eraseFromParent();
6728 return SplitBB;
6729 }
6730 case AMDGPU::SIMULATED_TRAP: {
6731 assert(Subtarget->hasPrivEnabledTrap2NopBug());
6732 MachineBasicBlock *SplitBB =
6733 TII->insertSimulatedTrap(MRI, MBB&: *BB, MI, DL: MI.getDebugLoc());
6734 MI.eraseFromParent();
6735 return SplitBB;
6736 }
6737 case AMDGPU::SI_TCRETURN_GFX_WholeWave:
6738 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
6739 assert(MFI->isWholeWaveFunction());
6740
6741 // During ISel, it's difficult to propagate the original EXEC mask to use as
6742 // an input to SI_WHOLE_WAVE_FUNC_RETURN. Set it up here instead.
6743 MachineInstr *Setup = TII->getWholeWaveFunctionSetup(MF&: *BB->getParent());
6744 assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
6745 Register OriginalExec = Setup->getOperand(i: 0).getReg();
6746 MF->getRegInfo().clearKillFlags(Reg: OriginalExec);
6747 MI.getOperand(i: 0).setReg(OriginalExec);
6748 return BB;
6749 }
6750 default:
6751 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
6752 if (!MI.mayStore())
6753 AddMemOpInit(MI);
6754 return BB;
6755 }
6756 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, MBB: BB);
6757 }
6758}
6759
6760bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
6761 // This currently forces unfolding various combinations of fsub into fma with
6762 // free fneg'd operands. As long as we have fast FMA (controlled by
6763 // isFMAFasterThanFMulAndFAdd), we should perform these.
6764
6765 // When fma is quarter rate, for f64 where add / sub are at best half rate,
6766 // most of these combines appear to be cycle neutral but save on instruction
6767 // count / code size.
6768 return true;
6769}
6770
6771bool SITargetLowering::enableAggressiveFMAFusion(LLT Ty) const { return true; }
6772
6773EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
6774 EVT VT) const {
6775 if (!VT.isVector()) {
6776 return MVT::i1;
6777 }
6778 return EVT::getVectorVT(Context&: Ctx, VT: MVT::i1, NumElements: VT.getVectorNumElements());
6779}
6780
6781MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
6782 // TODO: Should i16 always be used if legal? For now it would force VALU
6783 // shifts.
6784 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
6785}
6786
6787LLT SITargetLowering::getPreferredShiftAmountTy(LLT Ty) const {
6788 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
6789 ? Ty.changeElementSize(NewEltSize: 16)
6790 : Ty.changeElementSize(NewEltSize: 32);
6791}
6792
6793// Answering this is somewhat tricky and depends on the specific device, since
6794// different devices have different rates for fma and for f64 operations.
6795//
6796// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
6797// regardless of which device (although the number of cycles differs between
6798// devices), so it is always profitable for f64.
6799//
6800// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
6801// only on full-rate devices. Normally we should prefer selecting v_mad_f32,
6802// which we can always do even without fused FP ops, since it returns the same
6803// result as the separate operations and is always full rate. Therefore, we
6804// lie and report that fma is not faster for f32. However, v_mad_f32 does not
6805// support denormals, so we do report fma as faster if we have a fast fma
6806// device and denormals are required.
6807//
6808bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
6809 EVT VT) const {
6810 VT = VT.getScalarType();
6811
6812 switch (VT.getSimpleVT().SimpleTy) {
6813 case MVT::f32: {
6814 // If mad is not available this depends only on if f32 fma is full rate.
6815 if (!Subtarget->hasMadMacF32Insts())
6816 return Subtarget->hasFastFMAF32();
6817
6818 // Otherwise f32 mad is always full rate and returns the same result as
6819 // the separate operations, so it should be preferred over fma.
6820 // However, it does not support denormals.
6821 if (!denormalModeIsFlushAllF32(MF))
6822 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
6823
6824 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
6825 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
6826 }
6827 case MVT::f64:
6828 return true;
6829 case MVT::f16:
6830 case MVT::bf16:
6831 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
6832 default:
6833 break;
6834 }
6835
6836 return false;
6837}
6838
6839bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
6840 LLT Ty) const {
6841 switch (Ty.getScalarSizeInBits()) {
6842 case 16:
6843 return isFMAFasterThanFMulAndFAdd(MF, VT: MVT::f16);
6844 case 32:
6845 return isFMAFasterThanFMulAndFAdd(MF, VT: MVT::f32);
6846 case 64:
6847 return isFMAFasterThanFMulAndFAdd(MF, VT: MVT::f64);
6848 default:
6849 break;
6850 }
6851
6852 return false;
6853}
6854
6855bool SITargetLowering::isFMADLegal(const MachineInstr &MI, LLT Ty) const {
6856 if (!Ty.isScalar())
6857 return false;
6858
6859 if (Ty.getScalarSizeInBits() == 16)
6860 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(MF: *MI.getMF());
6861 if (Ty.getScalarSizeInBits() == 32)
6862 return Subtarget->hasMadMacF32Insts() &&
6863 denormalModeIsFlushAllF32(MF: *MI.getMF());
6864
6865 return false;
6866}
6867
6868bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG,
6869 const SDNode *N) const {
6870 // TODO: Check future ftz flag
6871 // v_mad_f32/v_mac_f32 do not support denormals.
6872 EVT VT = N->getValueType(ResNo: 0);
6873 if (VT == MVT::f32)
6874 return Subtarget->hasMadMacF32Insts() &&
6875 denormalModeIsFlushAllF32(MF: DAG.getMachineFunction());
6876 if (VT == MVT::f16) {
6877 return Subtarget->hasMadF16() &&
6878 denormalModeIsFlushAllF64F16(MF: DAG.getMachineFunction());
6879 }
6880
6881 return false;
6882}
6883
6884//===----------------------------------------------------------------------===//
6885// Custom DAG Lowering Operations
6886//===----------------------------------------------------------------------===//
6887
6888// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6889// wider vector type is legal.
6890SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
6891 SelectionDAG &DAG) const {
6892 unsigned Opc = Op.getOpcode();
6893 EVT VT = Op.getValueType();
6894 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6895 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6896 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6897 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6898 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6899 VT == MVT::v32bf16);
6900
6901 auto [Lo, Hi] = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 0);
6902
6903 SDLoc SL(Op);
6904 SDValue OpLo = DAG.getNode(Opcode: Opc, DL: SL, VT: Lo.getValueType(), Operand: Lo, Flags: Op->getFlags());
6905 SDValue OpHi = DAG.getNode(Opcode: Opc, DL: SL, VT: Hi.getValueType(), Operand: Hi, Flags: Op->getFlags());
6906
6907 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(Op), VT, N1: OpLo, N2: OpHi);
6908}
6909
6910// Enable lowering of ROTR for vxi32 types. This is a workaround for a
6911// regression whereby unnecessary extra instructions were added to the codegen
6912// for rotr operations, caused by legalizing v2i32 or. This resulted in extra
6913// instructions to extract the result from the vector.
6914SDValue SITargetLowering::lowerROTR(SDValue Op, SelectionDAG &DAG) const {
6915 [[maybe_unused]] EVT VT = Op.getValueType();
6916
6917 assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
6918 VT == MVT::v16i32) &&
6919 "Unexpected ValueType.");
6920
6921 return DAG.UnrollVectorOp(N: Op.getNode());
6922}
6923
6924// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6925// wider vector type is legal.
6926SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
6927 SelectionDAG &DAG) const {
6928 unsigned Opc = Op.getOpcode();
6929 EVT VT = Op.getValueType();
6930 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6931 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6932 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6933 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6934 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6935 VT == MVT::v32bf16);
6936
6937 auto [Lo0, Hi0] = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 0);
6938 auto [Lo1, Hi1] = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 1);
6939
6940 SDLoc SL(Op);
6941
6942 SDValue OpLo =
6943 DAG.getNode(Opcode: Opc, DL: SL, VT: Lo0.getValueType(), N1: Lo0, N2: Lo1, Flags: Op->getFlags());
6944 SDValue OpHi =
6945 DAG.getNode(Opcode: Opc, DL: SL, VT: Hi0.getValueType(), N1: Hi0, N2: Hi1, Flags: Op->getFlags());
6946
6947 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(Op), VT, N1: OpLo, N2: OpHi);
6948}
6949
6950SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
6951 SelectionDAG &DAG) const {
6952 unsigned Opc = Op.getOpcode();
6953 EVT VT = Op.getValueType();
6954 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
6955 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
6956 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6957 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
6958 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
6959 VT == MVT::v32bf16);
6960
6961 SDValue Op0 = Op.getOperand(i: 0);
6962 auto [Lo0, Hi0] = Op0.getValueType().isVector()
6963 ? DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 0)
6964 : std::pair(Op0, Op0);
6965
6966 auto [Lo1, Hi1] = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 1);
6967 auto [Lo2, Hi2] = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 2);
6968
6969 SDLoc SL(Op);
6970 auto ResVT = DAG.GetSplitDestVTs(VT);
6971
6972 SDValue OpLo =
6973 DAG.getNode(Opcode: Opc, DL: SL, VT: ResVT.first, N1: Lo0, N2: Lo1, N3: Lo2, Flags: Op->getFlags());
6974 SDValue OpHi =
6975 DAG.getNode(Opcode: Opc, DL: SL, VT: ResVT.second, N1: Hi0, N2: Hi1, N3: Hi2, Flags: Op->getFlags());
6976
6977 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(Op), VT, N1: OpLo, N2: OpHi);
6978}
6979
6980SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
6981 switch (Op.getOpcode()) {
6982 default:
6983 return AMDGPUTargetLowering::LowerOperation(Op, DAG);
6984 case ISD::BRCOND:
6985 return LowerBRCOND(Op, DAG);
6986 case ISD::RETURNADDR:
6987 return LowerRETURNADDR(Op, DAG);
6988 case ISD::SPONENTRY:
6989 return LowerSPONENTRY(Op, DAG);
6990 case ISD::LOAD: {
6991 SDValue Result = LowerLOAD(Op, DAG);
6992 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
6993 "Load should return a value and a chain");
6994 return Result;
6995 }
6996 case ISD::FSQRT: {
6997 EVT VT = Op.getValueType();
6998 if (VT == MVT::f32)
6999 return lowerFSQRTF32(Op, DAG);
7000 if (VT == MVT::f64)
7001 return lowerFSQRTF64(Op, DAG);
7002 return SDValue();
7003 }
7004 case ISD::FSIN:
7005 case ISD::FCOS:
7006 return LowerTrig(Op, DAG);
7007 case ISD::SELECT:
7008 return LowerSELECT(Op, DAG);
7009 case ISD::FDIV:
7010 return LowerFDIV(Op, DAG);
7011 case ISD::FFREXP:
7012 return LowerFFREXP(Op, DAG);
7013 case ISD::ATOMIC_CMP_SWAP:
7014 return LowerATOMIC_CMP_SWAP(Op, DAG);
7015 case ISD::STORE:
7016 return LowerSTORE(Op, DAG);
7017 case ISD::GlobalAddress: {
7018 MachineFunction &MF = DAG.getMachineFunction();
7019 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
7020 return LowerGlobalAddress(MFI, Op, DAG);
7021 }
7022 case ISD::ExternalSymbol:
7023 return LowerExternalSymbol(Op, DAG);
7024 case ISD::INTRINSIC_WO_CHAIN:
7025 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
7026 case ISD::INTRINSIC_W_CHAIN:
7027 return LowerINTRINSIC_W_CHAIN(Op, DAG);
7028 case ISD::INTRINSIC_VOID:
7029 return LowerINTRINSIC_VOID(Op, DAG);
7030 case ISD::ADDRSPACECAST:
7031 return lowerADDRSPACECAST(Op, DAG);
7032 case ISD::INSERT_SUBVECTOR:
7033 return lowerINSERT_SUBVECTOR(Op, DAG);
7034 case ISD::INSERT_VECTOR_ELT:
7035 return lowerINSERT_VECTOR_ELT(Op, DAG);
7036 case ISD::EXTRACT_VECTOR_ELT:
7037 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
7038 case ISD::VECTOR_SHUFFLE:
7039 return lowerVECTOR_SHUFFLE(Op, DAG);
7040 case ISD::SCALAR_TO_VECTOR:
7041 return lowerSCALAR_TO_VECTOR(Op, DAG);
7042 case ISD::BUILD_VECTOR:
7043 return lowerBUILD_VECTOR(Op, DAG);
7044 case ISD::FP_ROUND:
7045 case ISD::STRICT_FP_ROUND:
7046 return lowerFP_ROUND(Op, DAG);
7047 case ISD::TRAP:
7048 return lowerTRAP(Op, DAG);
7049 case ISD::DEBUGTRAP:
7050 return lowerDEBUGTRAP(Op, DAG);
7051 case ISD::ABS:
7052 case ISD::FABS:
7053 case ISD::FNEG:
7054 case ISD::FCANONICALIZE:
7055 case ISD::BSWAP:
7056 return splitUnaryVectorOp(Op, DAG);
7057 case ISD::FMINNUM:
7058 case ISD::FMAXNUM:
7059 return lowerFMINNUM_FMAXNUM(Op, DAG);
7060 case ISD::FMINIMUMNUM:
7061 case ISD::FMAXIMUMNUM:
7062 return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
7063 case ISD::FMINIMUM:
7064 case ISD::FMAXIMUM:
7065 return lowerFMINIMUM_FMAXIMUM(Op, DAG);
7066 case ISD::FLDEXP:
7067 case ISD::STRICT_FLDEXP:
7068 return lowerFLDEXP(Op, DAG);
7069 case ISD::FMA:
7070 return splitTernaryVectorOp(Op, DAG);
7071 case ISD::FP_TO_SINT:
7072 case ISD::FP_TO_UINT:
7073 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11 &&
7074 Op.getValueType() == MVT::i16 &&
7075 Op.getOperand(i: 0).getValueType() == MVT::f32) {
7076 // Make f32->i16 legal so we can select V_CVT_PK_[IU]16_F32.
7077 return Op;
7078 }
7079 return LowerFP_TO_INT(Op, DAG);
7080 case ISD::SHL:
7081 case ISD::SRA:
7082 case ISD::SRL:
7083 case ISD::ADD:
7084 case ISD::SUB:
7085 case ISD::SMIN:
7086 case ISD::SMAX:
7087 case ISD::UMIN:
7088 case ISD::UMAX:
7089 case ISD::FADD:
7090 case ISD::FMUL:
7091 case ISD::FMINNUM_IEEE:
7092 case ISD::FMAXNUM_IEEE:
7093 case ISD::UADDSAT:
7094 case ISD::USUBSAT:
7095 case ISD::SADDSAT:
7096 case ISD::SSUBSAT:
7097 return splitBinaryVectorOp(Op, DAG);
7098 case ISD::FCOPYSIGN:
7099 return lowerFCOPYSIGN(Op, DAG);
7100 case ISD::MUL:
7101 return lowerMUL(Op, DAG);
7102 case ISD::SMULO:
7103 case ISD::UMULO:
7104 return lowerXMULO(Op, DAG);
7105 case ISD::SMUL_LOHI:
7106 case ISD::UMUL_LOHI:
7107 return lowerXMUL_LOHI(Op, DAG);
7108 case ISD::DYNAMIC_STACKALLOC:
7109 return LowerDYNAMIC_STACKALLOC(Op, DAG);
7110 case ISD::STACKSAVE:
7111 return LowerSTACKSAVE(Op, DAG);
7112 case ISD::GET_ROUNDING:
7113 return lowerGET_ROUNDING(Op, DAG);
7114 case ISD::SET_ROUNDING:
7115 return lowerSET_ROUNDING(Op, DAG);
7116 case ISD::PREFETCH:
7117 return lowerPREFETCH(Op, DAG);
7118 case ISD::FP_EXTEND:
7119 case ISD::STRICT_FP_EXTEND:
7120 return lowerFP_EXTEND(Op, DAG);
7121 case ISD::GET_FPENV:
7122 return lowerGET_FPENV(Op, DAG);
7123 case ISD::SET_FPENV:
7124 return lowerSET_FPENV(Op, DAG);
7125 case ISD::ROTR:
7126 return lowerROTR(Op, DAG);
7127 }
7128 return SDValue();
7129}
7130
7131// Used for D16: casts the result of an instruction into the right vector and
7132// packs values if loads return unpacked values.
7133static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
7134 const SDLoc &DL, SelectionDAG &DAG,
7135 bool Unpacked) {
7136 if (!LoadVT.isVector())
7137 return Result;
7138
7139 // Cast back to the original packed type, or to a larger type that is a
7140 // multiple of 32 bits for D16. Widening the return type is required for
7141 // legalization.
7142 EVT FittingLoadVT = LoadVT;
7143 if ((LoadVT.getVectorNumElements() % 2) == 1) {
7144 FittingLoadVT =
7145 EVT::getVectorVT(Context&: *DAG.getContext(), VT: LoadVT.getVectorElementType(),
7146 NumElements: LoadVT.getVectorNumElements() + 1);
7147 }
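// For example, an illegal v3f16 result is widened to v4f16 here so that the
// type is a multiple of 32 bits.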
7148
7149 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
7150 // Truncate to v2i16/v4i16.
7151 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
7152
7153 // Work around the legalizer not scalarizing the truncate after vector op
7154 // legalization while also not creating an intermediate vector trunc.
7155 SmallVector<SDValue, 4> Elts;
7156 DAG.ExtractVectorElements(Op: Result, Args&: Elts);
7157 for (SDValue &Elt : Elts)
7158 Elt = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: Elt);
7159
7160 // Pad illegal v1i16/v3f16 to v4i16.
7161 if ((LoadVT.getVectorNumElements() % 2) == 1)
7162 Elts.push_back(Elt: DAG.getPOISON(VT: MVT::i16));
7163
7164 Result = DAG.getBuildVector(VT: IntLoadVT, DL, Ops: Elts);
7165
7166 // Bitcast to original type (v2f16/v4f16).
7167 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: FittingLoadVT, Operand: Result);
7168 }
7169
7170 // Cast back to the original packed type.
7171 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: FittingLoadVT, Operand: Result);
7172}
7173
7174SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
7175 SelectionDAG &DAG,
7176 ArrayRef<SDValue> Ops,
7177 bool IsIntrinsic) const {
7178 SDLoc DL(M);
7179
7180 bool Unpacked = Subtarget->hasUnpackedD16VMem();
7181 EVT LoadVT = M->getValueType(ResNo: 0);
7182
7183 EVT EquivLoadVT = LoadVT;
7184 if (LoadVT.isVector()) {
7185 if (Unpacked) {
7186 EquivLoadVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32,
7187 NumElements: LoadVT.getVectorNumElements());
7188 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
7189 // Widen v3f16 to legal type
7190 EquivLoadVT =
7191 EVT::getVectorVT(Context&: *DAG.getContext(), VT: LoadVT.getVectorElementType(),
7192 NumElements: LoadVT.getVectorNumElements() + 1);
7193 }
7194 }
7195
7196 // Change from v4f16/v2f16 to EquivLoadVT.
7197 SDVTList VTList = DAG.getVTList(VT1: EquivLoadVT, VT2: MVT::Other);
7198
7199 SDValue Load = DAG.getMemIntrinsicNode(
7200 Opcode: IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, dl: DL, VTList, Ops,
7201 MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
7202
7203 SDValue Adjusted = adjustLoadValueTypeImpl(Result: Load, LoadVT, DL, DAG, Unpacked);
7204
7205 return DAG.getMergeValues(Ops: {Adjusted, Load.getValue(R: 1)}, dl: DL);
7206}
7207
7208SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
7209 SelectionDAG &DAG,
7210 ArrayRef<SDValue> Ops) const {
7211 SDLoc DL(M);
7212 EVT LoadVT = M->getValueType(ResNo: 0);
7213 EVT EltType = LoadVT.getScalarType();
7214 EVT IntVT = LoadVT.changeTypeToInteger();
7215
7216 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
7217
7218 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
7219 bool IsTFE = M->getNumValues() == 3;
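// A TFE load returns an extra status value in addition to the data and the
// chain, which is why such nodes have a third result.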
7220
7221 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
7222 : AMDGPUISD::BUFFER_LOAD_FORMAT)
7223 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
7224 : AMDGPUISD::BUFFER_LOAD;
7225
7226 if (IsD16) {
7227 return adjustLoadValueType(Opcode: AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
7228 }
7229
7230 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
7231 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
7232 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, MMO: M->getMemOperand(),
7233 IsTFE);
7234
7235 if (isTypeLegal(VT: LoadVT)) {
7236 return getMemIntrinsicNode(Opcode: Opc, DL, VTList: M->getVTList(), Ops, MemVT: IntVT,
7237 MMO: M->getMemOperand(), DAG);
7238 }
7239
7240 EVT CastVT = getEquivalentMemType(Context&: *DAG.getContext(), VT: LoadVT);
7241 SDVTList VTList = DAG.getVTList(VT1: CastVT, VT2: MVT::Other);
7242 SDValue MemNode = getMemIntrinsicNode(Opcode: Opc, DL, VTList, Ops, MemVT: CastVT,
7243 MMO: M->getMemOperand(), DAG);
7244 return DAG.getMergeValues(
7245 Ops: {DAG.getNode(Opcode: ISD::BITCAST, DL, VT: LoadVT, Operand: MemNode), MemNode.getValue(R: 1)},
7246 dl: DL);
7247}
7248
7249static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
7250 SelectionDAG &DAG) {
7251 EVT VT = N->getValueType(ResNo: 0);
7252 unsigned CondCode = N->getConstantOperandVal(Num: 3);
7253 if (!ICmpInst::isIntPredicate(P: static_cast<ICmpInst::Predicate>(CondCode)))
7254 return DAG.getPOISON(VT);
7255
7256 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
7257
7258 SDValue LHS = N->getOperand(Num: 1);
7259 SDValue RHS = N->getOperand(Num: 2);
7260
7261 SDLoc DL(N);
7262
7263 EVT CmpVT = LHS.getValueType();
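// Promote an illegal i16 compare to i32, extending according to the
// signedness of the predicate.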
7264 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(VT: MVT::i16)) {
7265 unsigned PromoteOp =
7266 ICmpInst::isSigned(predicate: IcInput) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
7267 LHS = DAG.getNode(Opcode: PromoteOp, DL, VT: MVT::i32, Operand: LHS);
7268 RHS = DAG.getNode(Opcode: PromoteOp, DL, VT: MVT::i32, Operand: RHS);
7269 }
7270
7271 ISD::CondCode CCOpcode = getICmpCondCode(Pred: IcInput);
7272
7273 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7274 EVT CCVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: WavefrontSize);
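// The compare result is a wavefront lane mask, i.e. i32 on wave32 and i64 on
// wave64.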
7275
7276 SDValue SetCC = DAG.getNode(Opcode: AMDGPUISD::SETCC, DL, VT: CCVT, N1: LHS, N2: RHS,
7277 N3: DAG.getCondCode(Cond: CCOpcode));
7278 if (VT.bitsEq(VT: CCVT))
7279 return SetCC;
7280 return DAG.getZExtOrTrunc(Op: SetCC, DL, VT);
7281}
7282
7283static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
7284 SelectionDAG &DAG) {
7285 EVT VT = N->getValueType(ResNo: 0);
7286
7287 unsigned CondCode = N->getConstantOperandVal(Num: 3);
7288 if (!FCmpInst::isFPPredicate(P: static_cast<FCmpInst::Predicate>(CondCode)))
7289 return DAG.getPOISON(VT);
7290
7291 SDValue Src0 = N->getOperand(Num: 1);
7292 SDValue Src1 = N->getOperand(Num: 2);
7293 EVT CmpVT = Src0.getValueType();
7294 SDLoc SL(N);
7295
7296 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(VT: CmpVT)) {
7297 Src0 = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Src0);
7298 Src1 = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Src1);
7299 }
7300
7301 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
7302 ISD::CondCode CCOpcode = getFCmpCondCode(Pred: IcInput);
7303 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7304 EVT CCVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: WavefrontSize);
7305 SDValue SetCC = DAG.getNode(Opcode: AMDGPUISD::SETCC, DL: SL, VT: CCVT, N1: Src0, N2: Src1,
7306 N3: DAG.getCondCode(Cond: CCOpcode));
7307 if (VT.bitsEq(VT: CCVT))
7308 return SetCC;
7309 return DAG.getZExtOrTrunc(Op: SetCC, DL: SL, VT);
7310}
7311
7312static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
7313 SelectionDAG &DAG) {
7314 EVT VT = N->getValueType(ResNo: 0);
7315 SDValue Src = N->getOperand(Num: 1);
7316 SDLoc SL(N);
7317
7318 if (Src.getOpcode() == ISD::SETCC) {
7319 SDValue Op0 = Src.getOperand(i: 0);
7320 SDValue Op1 = Src.getOperand(i: 1);
7321 // Need to expand bfloat to float for comparison (setcc).
7322 if (Op0.getValueType() == MVT::bf16) {
7323 Op0 = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Op0);
7324 Op1 = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Op1);
7325 }
7326 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
7327 return DAG.getNode(Opcode: AMDGPUISD::SETCC, DL: SL, VT, N1: Op0, N2: Op1, N3: Src.getOperand(i: 2));
7328 }
7329 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Val&: Src)) {
7330 // (ballot 0) -> 0
7331 if (Arg->isZero())
7332 return DAG.getConstant(Val: 0, DL: SL, VT);
7333
7334 // (ballot 1) -> EXEC/EXEC_LO
7335 if (Arg->isOne()) {
7336 Register Exec;
7337 if (VT.getScalarSizeInBits() == 32)
7338 Exec = AMDGPU::EXEC_LO;
7339 else if (VT.getScalarSizeInBits() == 64)
7340 Exec = AMDGPU::EXEC;
7341 else
7342 return SDValue();
7343
7344 return DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: SL, Reg: Exec, VT);
7345 }
7346 }
7347
7348 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
7349 // ISD::SETNE)
7350 return DAG.getNode(
7351 Opcode: AMDGPUISD::SETCC, DL: SL, VT, N1: DAG.getZExtOrTrunc(Op: Src, DL: SL, VT: MVT::i32),
7352 N2: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32), N3: DAG.getCondCode(Cond: ISD::SETNE));
7353}
7354
7355static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
7356 SelectionDAG &DAG) {
7357 EVT VT = N->getValueType(ResNo: 0);
7358 unsigned ValSize = VT.getSizeInBits();
7359 unsigned IID = N->getConstantOperandVal(Num: 0);
7360 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
7361 IID == Intrinsic::amdgcn_permlanex16;
7362 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
7363 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
7364 SDLoc SL(N);
7365 MVT IntVT = MVT::getIntegerVT(BitWidth: ValSize);
7366 const GCNSubtarget *ST = TLI.getSubtarget();
7367 unsigned SplitSize = 32;
7368 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
7369 ST->hasDPALU_DPP() &&
7370 AMDGPU::isLegalDPALU_DPPControl(ST: *ST, DC: N->getConstantOperandVal(Num: 3)))
7371 SplitSize = 64;
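// Wider values are handled below by splitting them into SplitSize-bit pieces
// and emitting one lane op per piece; 64-bit pieces are only used for
// update_dpp when the DPP control is legal for the 64-bit DP ALU.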
7372
7373 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
7374 SDValue Src2, MVT ValT) -> SDValue {
7375 SmallVector<SDValue, 8> Operands;
7376 switch (IID) {
7377 case Intrinsic::amdgcn_permlane16:
7378 case Intrinsic::amdgcn_permlanex16:
7379 case Intrinsic::amdgcn_update_dpp:
7380 Operands.push_back(Elt: N->getOperand(Num: 6));
7381 Operands.push_back(Elt: N->getOperand(Num: 5));
7382 Operands.push_back(Elt: N->getOperand(Num: 4));
7383 [[fallthrough]];
7384 case Intrinsic::amdgcn_writelane:
7385 Operands.push_back(Elt: Src2);
7386 [[fallthrough]];
7387 case Intrinsic::amdgcn_readlane:
7388 case Intrinsic::amdgcn_set_inactive:
7389 case Intrinsic::amdgcn_set_inactive_chain_arg:
7390 case Intrinsic::amdgcn_mov_dpp8:
7391 Operands.push_back(Elt: Src1);
7392 [[fallthrough]];
7393 case Intrinsic::amdgcn_readfirstlane:
7394 case Intrinsic::amdgcn_permlane64:
7395 Operands.push_back(Elt: Src0);
7396 break;
7397 default:
7398 llvm_unreachable("unhandled lane op");
7399 }
7400
7401 Operands.push_back(Elt: DAG.getTargetConstant(Val: IID, DL: SL, VT: MVT::i32));
7402 std::reverse(first: Operands.begin(), last: Operands.end());
7403
7404 if (SDNode *GL = N->getGluedNode()) {
7405 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7406 GL = GL->getOperand(Num: 0).getNode();
7407 Operands.push_back(Elt: DAG.getNode(Opcode: ISD::CONVERGENCECTRL_GLUE, DL: SL, VT: MVT::Glue,
7408 Operand: SDValue(GL, 0)));
7409 }
7410
7411 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: ValT, Ops: Operands);
7412 };
7413
7414 SDValue Src0 = N->getOperand(Num: 1);
7415 SDValue Src1, Src2;
7416 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
7417 IID == Intrinsic::amdgcn_mov_dpp8 ||
7418 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7419 Src1 = N->getOperand(Num: 2);
7420 if (IID == Intrinsic::amdgcn_writelane ||
7421 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
7422 Src2 = N->getOperand(Num: 3);
7423 }
7424
7425 if (ValSize == SplitSize) {
7426 // Already legal
7427 return SDValue();
7428 }
7429
7430 if (ValSize < 32) {
7431 bool IsFloat = VT.isFloatingPoint();
7432 Src0 = DAG.getAnyExtOrTrunc(Op: IsFloat ? DAG.getBitcast(VT: IntVT, V: Src0) : Src0,
7433 DL: SL, VT: MVT::i32);
7434
7435 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7436 Src1 = DAG.getAnyExtOrTrunc(Op: IsFloat ? DAG.getBitcast(VT: IntVT, V: Src1) : Src1,
7437 DL: SL, VT: MVT::i32);
7438 }
7439
7440 if (IID == Intrinsic::amdgcn_writelane) {
7441 Src2 = DAG.getAnyExtOrTrunc(Op: IsFloat ? DAG.getBitcast(VT: IntVT, V: Src2) : Src2,
7442 DL: SL, VT: MVT::i32);
7443 }
7444
7445 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
7446 SDValue Trunc = DAG.getAnyExtOrTrunc(Op: LaneOp, DL: SL, VT: IntVT);
7447 return IsFloat ? DAG.getBitcast(VT, V: Trunc) : Trunc;
7448 }
7449
7450 if (ValSize % SplitSize != 0)
7451 return SDValue();
7452
7453 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
7454 EVT VT = N->getValueType(ResNo: 0);
7455 unsigned NE = VT.getVectorNumElements();
7456 EVT EltVT = VT.getVectorElementType();
7457 SmallVector<SDValue, 8> Scalars;
7458 unsigned NumOperands = N->getNumOperands();
7459 SmallVector<SDValue, 4> Operands(NumOperands);
7460 SDNode *GL = N->getGluedNode();
7461
7462 // only handle convergencectrl_glue
7463 assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7464
7465 for (unsigned i = 0; i != NE; ++i) {
7466 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
7467 ++j) {
7468 SDValue Operand = N->getOperand(Num: j);
7469 EVT OperandVT = Operand.getValueType();
7470 if (OperandVT.isVector()) {
7471 // A vector operand; extract a single element.
7472 EVT OperandEltVT = OperandVT.getVectorElementType();
7473 Operands[j] = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: OperandEltVT,
7474 N1: Operand, N2: DAG.getVectorIdxConstant(Val: i, DL: SL));
7475 } else {
7476 // A scalar operand; just use it as is.
7477 Operands[j] = Operand;
7478 }
7479 }
7480
7481 if (GL)
7482 Operands[NumOperands - 1] =
7483 DAG.getNode(Opcode: ISD::CONVERGENCECTRL_GLUE, DL: SL, VT: MVT::Glue,
7484 Operand: SDValue(GL->getOperand(Num: 0).getNode(), 0));
7485
7486 Scalars.push_back(Elt: DAG.getNode(Opcode: N->getOpcode(), DL: SL, VT: EltVT, Ops: Operands));
7487 }
7488
7489 EVT VecVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: EltVT, NumElements: NE);
7490 return DAG.getBuildVector(VT: VecVT, DL: SL, Ops: Scalars);
7491 };
7492
7493 if (VT.isVector()) {
7494 switch (MVT::SimpleValueType EltTy =
7495 VT.getVectorElementType().getSimpleVT().SimpleTy) {
7496 case MVT::i32:
7497 case MVT::f32:
7498 if (SplitSize == 32) {
7499 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
7500 return unrollLaneOp(LaneOp.getNode());
7501 }
7502 [[fallthrough]];
7503 case MVT::i16:
7504 case MVT::f16:
7505 case MVT::bf16: {
7506 unsigned SubVecNumElt =
7507 SplitSize / VT.getVectorElementType().getSizeInBits();
7508 MVT SubVecVT = MVT::getVectorVT(VT: EltTy, NumElements: SubVecNumElt);
7509 SmallVector<SDValue, 4> Pieces;
7510 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
7511 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
7512 Src0SubVec = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: SubVecVT, N1: Src0,
7513 N2: DAG.getConstant(Val: EltIdx, DL: SL, VT: MVT::i32));
7514
7515 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
7516 IsPermLane16)
7517 Src1SubVec = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: SubVecVT, N1: Src1,
7518 N2: DAG.getConstant(Val: EltIdx, DL: SL, VT: MVT::i32));
7519
7520 if (IID == Intrinsic::amdgcn_writelane)
7521 Src2SubVec = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: SubVecVT, N1: Src2,
7522 N2: DAG.getConstant(Val: EltIdx, DL: SL, VT: MVT::i32));
7523
7524 Pieces.push_back(
7525 Elt: IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
7526 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
7527 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
7528 EltIdx += SubVecNumElt;
7529 }
7530 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SL, VT, Ops: Pieces);
7531 }
7532 default:
7533 // Handle all other cases by bitcasting to i32 vectors
7534 break;
7535 }
7536 }
7537
7538 MVT VecVT =
7539 MVT::getVectorVT(VT: MVT::getIntegerVT(BitWidth: SplitSize), NumElements: ValSize / SplitSize);
7540 Src0 = DAG.getBitcast(VT: VecVT, V: Src0);
7541
7542 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
7543 Src1 = DAG.getBitcast(VT: VecVT, V: Src1);
7544
7545 if (IID == Intrinsic::amdgcn_writelane)
7546 Src2 = DAG.getBitcast(VT: VecVT, V: Src2);
7547
7548 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
7549 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
7550 return DAG.getBitcast(VT, V: UnrolledLaneOp);
7551}
7552
7553static SDValue lowerWaveShuffle(const SITargetLowering &TLI, SDNode *N,
7554 SelectionDAG &DAG) {
7555 EVT VT = N->getValueType(ResNo: 0);
7556
7557 if (VT.getSizeInBits() != 32)
7558 return SDValue();
7559
7560 SDLoc SL(N);
7561
7562 SDValue Value = N->getOperand(Num: 1);
7563 SDValue Index = N->getOperand(Num: 2);
7564
7565 // ds_bpermute requires the index to be multiplied by 4 (it is a byte address)
7566 SDValue ShiftAmount = DAG.getShiftAmountConstant(Val: 2, VT: MVT::i32, DL: SL);
7567 SDValue ShiftedIndex =
7568 DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: Index.getValueType(), N1: Index, N2: ShiftAmount);
7569
7570 // The intrinsics used below operate on i32 values
7571 SDValue ValueI32 = DAG.getBitcast(VT: MVT::i32, V: Value);
7572
7573 auto MakeIntrinsic = [&DAG, &SL](unsigned IID, MVT RetVT,
7574 SmallVector<SDValue> IntrinArgs) -> SDValue {
7575 SmallVector<SDValue> Operands(1);
7576 Operands[0] = DAG.getTargetConstant(Val: IID, DL: SL, VT: MVT::i32);
7577 Operands.append(RHS: IntrinArgs);
7578 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: RetVT, Ops: Operands);
7579 };
7580
7581 // If we can bpermute across the whole wave, then just do that
7582 if (TLI.getSubtarget()->supportsWaveWideBPermute()) {
7583 SDValue BPermute = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
7584 {ShiftedIndex, ValueI32});
7585 return DAG.getBitcast(VT, V: BPermute);
7586 }
7587
7588 assert(TLI.getSubtarget()->isWave64());
7589
7590 // Otherwise, we need to make use of whole wave mode
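// The idea: permlane64 swaps the two 32-lane halves of the wave, ds_bpermute
// permutes within each half, and a final select chooses between the same-half
// and other-half results depending on which half the source lane lives in.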
7591 SDValue PoisonVal = DAG.getPOISON(VT: ValueI32->getValueType(ResNo: 0));
7592
7593 // Set inactive lanes to poison
7594 SDValue WWMValue = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
7595 {ValueI32, PoisonVal});
7596 SDValue WWMIndex = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
7597 {ShiftedIndex, PoisonVal});
7598
7599 SDValue Swapped =
7600 MakeIntrinsic(Intrinsic::amdgcn_permlane64, MVT::i32, {WWMValue});
7601
7602 // Get permutation of each half, then we'll select which one to use
7603 SDValue BPermSameHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
7604 {WWMIndex, WWMValue});
7605 SDValue BPermOtherHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
7606 MVT::i32, {WWMIndex, Swapped});
7607 SDValue BPermOtherHalfWWM =
7608 MakeIntrinsic(Intrinsic::amdgcn_wwm, MVT::i32, {BPermOtherHalf});
7609
7610 // Select which side to take the permute from
7611 SDValue ThreadIDMask = DAG.getAllOnesConstant(DL: SL, VT: MVT::i32);
7612 // We can get away with only using mbcnt_lo here since we're only
7613 // trying to detect which side of 32 each lane is on, and mbcnt_lo
7614 // returns 32 for lanes 32-63.
7615 SDValue ThreadID =
7616 MakeIntrinsic(Intrinsic::amdgcn_mbcnt_lo, MVT::i32,
7617 {ThreadIDMask, DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32)});
7618
7619 SDValue SameOrOtherHalf =
7620 DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32,
7621 N1: DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i32, N1: ThreadID, N2: Index),
7622 N2: DAG.getTargetConstant(Val: 32, DL: SL, VT: MVT::i32));
7623 SDValue UseSameHalf =
7624 DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: SameOrOtherHalf,
7625 RHS: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32), Cond: ISD::SETEQ);
7626 SDValue Result = DAG.getSelect(DL: SL, VT: MVT::i32, Cond: UseSameHalf, LHS: BPermSameHalf,
7627 RHS: BPermOtherHalfWWM);
7628 return DAG.getBitcast(VT, V: Result);
7629}
7630
7631void SITargetLowering::ReplaceNodeResults(SDNode *N,
7632 SmallVectorImpl<SDValue> &Results,
7633 SelectionDAG &DAG) const {
7634 switch (N->getOpcode()) {
7635 case ISD::INSERT_VECTOR_ELT: {
7636 if (SDValue Res = lowerINSERT_VECTOR_ELT(Op: SDValue(N, 0), DAG))
7637 Results.push_back(Elt: Res);
7638 return;
7639 }
7640 case ISD::EXTRACT_VECTOR_ELT: {
7641 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(Op: SDValue(N, 0), DAG))
7642 Results.push_back(Elt: Res);
7643 return;
7644 }
7645 case ISD::INTRINSIC_WO_CHAIN: {
7646 unsigned IID = N->getConstantOperandVal(Num: 0);
7647 switch (IID) {
7648 case Intrinsic::amdgcn_make_buffer_rsrc:
7649 Results.push_back(Elt: lowerPointerAsRsrcIntrin(Op: N, DAG));
7650 return;
7651 case Intrinsic::amdgcn_cvt_pkrtz: {
7652 SDValue Src0 = N->getOperand(Num: 1);
7653 SDValue Src1 = N->getOperand(Num: 2);
7654 SDLoc SL(N);
7655 SDValue Cvt =
7656 DAG.getNode(Opcode: AMDGPUISD::CVT_PKRTZ_F16_F32, DL: SL, VT: MVT::i32, N1: Src0, N2: Src1);
7657 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2f16, Operand: Cvt));
7658 return;
7659 }
7660 case Intrinsic::amdgcn_cvt_pknorm_i16:
7661 case Intrinsic::amdgcn_cvt_pknorm_u16:
7662 case Intrinsic::amdgcn_cvt_pk_i16:
7663 case Intrinsic::amdgcn_cvt_pk_u16: {
7664 SDValue Src0 = N->getOperand(Num: 1);
7665 SDValue Src1 = N->getOperand(Num: 2);
7666 SDLoc SL(N);
7667 unsigned Opcode;
7668
7669 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
7670 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
7671 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
7672 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
7673 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
7674 Opcode = AMDGPUISD::CVT_PK_I16_I32;
7675 else
7676 Opcode = AMDGPUISD::CVT_PK_U16_U32;
7677
7678 EVT VT = N->getValueType(ResNo: 0);
7679 if (isTypeLegal(VT))
7680 Results.push_back(Elt: DAG.getNode(Opcode, DL: SL, VT, N1: Src0, N2: Src1));
7681 else {
7682 SDValue Cvt = DAG.getNode(Opcode, DL: SL, VT: MVT::i32, N1: Src0, N2: Src1);
7683 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i16, Operand: Cvt));
7684 }
7685 return;
7686 }
7687 case Intrinsic::amdgcn_s_buffer_load: {
7688 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
7689 // s_buffer_load_u8 for both signed and unsigned load instructions. Next, the
7690 // DAG combiner tries to merge the s_buffer_load_u8 with a sext instruction
7691 // (performSignExtendInRegCombine()) and replaces s_buffer_load_u8 with
7692 // s_buffer_load_i8.
7693 if (!Subtarget->hasScalarSubwordLoads())
7694 return;
7695 SDValue Op = SDValue(N, 0);
7696 SDValue Rsrc = Op.getOperand(i: 1);
7697 SDValue Offset = Op.getOperand(i: 2);
7698 SDValue CachePolicy = Op.getOperand(i: 3);
7699 EVT VT = Op.getValueType();
7700 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
7701 SDLoc DL(Op);
7702 MachineFunction &MF = DAG.getMachineFunction();
7703 const DataLayout &DataLayout = DAG.getDataLayout();
7704 Align Alignment =
7705 DataLayout.getABITypeAlign(Ty: VT.getTypeForEVT(Context&: *DAG.getContext()));
7706 MachineMemOperand *MMO = MF.getMachineMemOperand(
7707 PtrInfo: MachinePointerInfo(),
7708 F: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
7709 MachineMemOperand::MOInvariant,
7710 Size: VT.getStoreSize(), BaseAlignment: Alignment);
7711 SDValue LoadVal;
7712 if (!Offset->isDivergent()) {
7713 SDValue Ops[] = {Rsrc, // source register
7714 Offset, CachePolicy};
7715 SDValue BufferLoad =
7716 DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::SBUFFER_LOAD_UBYTE, dl: DL,
7717 VTList: DAG.getVTList(VT: MVT::i32), Ops, MemVT: VT, MMO);
7718 LoadVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: BufferLoad);
7719 } else {
7720 SDValue Ops[] = {
7721 DAG.getEntryNode(), // Chain
7722 Rsrc, // rsrc
7723 DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
7724 {}, // voffset
7725 {}, // soffset
7726 {}, // offset
7727 CachePolicy, // cachepolicy
7728 DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
7729 };
7730 setBufferOffsets(CombinedOffset: Offset, DAG, Offsets: &Ops[3], Alignment: Align(4));
7731 LoadVal = handleByteShortBufferLoads(DAG, LoadVT: VT, DL, Ops, MMO);
7732 }
7733 Results.push_back(Elt: LoadVal);
7734 return;
7735 }
7736 case Intrinsic::amdgcn_dead: {
7737 for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
7738 Results.push_back(Elt: DAG.getPOISON(VT: N->getValueType(ResNo: I)));
7739 return;
7740 }
7741 }
7742 break;
7743 }
7744 case ISD::INTRINSIC_W_CHAIN: {
7745 if (SDValue Res = LowerINTRINSIC_W_CHAIN(Op: SDValue(N, 0), DAG)) {
7746 if (Res.getOpcode() == ISD::MERGE_VALUES) {
7747 // FIXME: Hacky
7748 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
7749 Results.push_back(Elt: Res.getOperand(i: I));
7750 }
7751 } else {
7752 Results.push_back(Elt: Res);
7753 Results.push_back(Elt: Res.getValue(R: 1));
7754 }
7755 return;
7756 }
7757
7758 break;
7759 }
7760 case ISD::SELECT: {
7761 SDLoc SL(N);
7762 EVT VT = N->getValueType(ResNo: 0);
7763 EVT NewVT = getEquivalentMemType(Context&: *DAG.getContext(), VT);
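// Perform the select on an equivalent integer type; values narrower than i32
// are any-extended so the integer select is legal, then the result is
// truncated and bitcast back to the original type.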
7764 SDValue LHS = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVT, Operand: N->getOperand(Num: 1));
7765 SDValue RHS = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVT, Operand: N->getOperand(Num: 2));
7766
7767 EVT SelectVT = NewVT;
7768 if (NewVT.bitsLT(VT: MVT::i32)) {
7769 LHS = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i32, Operand: LHS);
7770 RHS = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i32, Operand: RHS);
7771 SelectVT = MVT::i32;
7772 }
7773
7774 SDValue NewSelect =
7775 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: SelectVT, N1: N->getOperand(Num: 0), N2: LHS, N3: RHS);
7776
7777 if (NewVT != SelectVT)
7778 NewSelect = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: NewVT, Operand: NewSelect);
7779 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: NewSelect));
7780 return;
7781 }
7782 case ISD::FNEG: {
7783 if (N->getValueType(ResNo: 0) != MVT::v2f16)
7784 break;
7785
7786 SDLoc SL(N);
7787 SDValue BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: N->getOperand(Num: 0));
7788
7789 SDValue Op = DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i32, N1: BC,
7790 N2: DAG.getConstant(Val: 0x80008000, DL: SL, VT: MVT::i32));
7791 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2f16, Operand: Op));
7792 return;
7793 }
7794 case ISD::FABS: {
7795 if (N->getValueType(ResNo: 0) != MVT::v2f16)
7796 break;
7797
7798 SDLoc SL(N);
7799 SDValue BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: N->getOperand(Num: 0));
7800
7801 SDValue Op = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32, N1: BC,
7802 N2: DAG.getConstant(Val: 0x7fff7fff, DL: SL, VT: MVT::i32));
7803 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2f16, Operand: Op));
7804 return;
7805 }
7806 case ISD::FSQRT: {
7807 if (N->getValueType(ResNo: 0) != MVT::f16)
7808 break;
7809 Results.push_back(Elt: lowerFSQRTF16(Op: SDValue(N, 0), DAG));
7810 break;
7811 }
7812 default:
7813 AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
7814 break;
7815 }
7816}
7817
7818/// Helper function for LowerBRCOND
7819static SDNode *findUser(SDValue Value, unsigned Opcode) {
7820
7821 for (SDUse &U : Value->uses()) {
7822 if (U.get() != Value)
7823 continue;
7824
7825 if (U.getUser()->getOpcode() == Opcode)
7826 return U.getUser();
7827 }
7828 return nullptr;
7829}
7830
7831unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
7832 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
7833 switch (Intr->getConstantOperandVal(Num: 1)) {
7834 case Intrinsic::amdgcn_if:
7835 return AMDGPUISD::IF;
7836 case Intrinsic::amdgcn_else:
7837 return AMDGPUISD::ELSE;
7838 case Intrinsic::amdgcn_loop:
7839 return AMDGPUISD::LOOP;
7840 case Intrinsic::amdgcn_end_cf:
7841 llvm_unreachable("should not occur");
7842 default:
7843 return 0;
7844 }
7845 }
7846
7847 // break, if_break, else_break are all only used as inputs to loop, not
7848 // directly as branch conditions.
7849 return 0;
7850}
7851
7852bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
7853 const Triple &TT = getTargetMachine().getTargetTriple();
7854 return (GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
7855 GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
7856 AMDGPU::shouldEmitConstantsToTextSection(TT);
7857}
7858
7859bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
7860 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
7861 return false;
7862
7863 // FIXME: Either avoid relying on address space here or change the default
7864 // address space for functions to avoid the explicit check.
7865 return (GV->getValueType()->isFunctionTy() ||
7866 !isNonGlobalAddrSpace(AS: GV->getAddressSpace())) &&
7867 !shouldEmitFixup(GV) && !getTargetMachine().shouldAssumeDSOLocal(GV);
7868}
7869
7870bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
7871 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
7872}
7873
7874bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const {
7875 if (!GV->hasExternalLinkage())
7876 return true;
7877
7878 const auto OS = getTargetMachine().getTargetTriple().getOS();
7879 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
7880}
7881
7882/// This transforms the control flow intrinsics to get the branch destination
7883/// as the last parameter. It also switches the branch target with BR if needed.
7884SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
7885 SDLoc DL(BRCOND);
7886
7887 SDNode *Intr = BRCOND.getOperand(i: 1).getNode();
7888 SDValue Target = BRCOND.getOperand(i: 2);
7889 SDNode *BR = nullptr;
7890 SDNode *SetCC = nullptr;
7891
7892 switch (Intr->getOpcode()) {
7893 case ISD::SETCC: {
7894 // As long as we negate the condition everything is fine
7895 SetCC = Intr;
7896 Intr = SetCC->getOperand(Num: 0).getNode();
7897 break;
7898 }
7899 case ISD::XOR: {
7900 // Similar to SETCC, if we have (xor c, -1), we will be fine.
7901 SDValue LHS = Intr->getOperand(Num: 0);
7902 SDValue RHS = Intr->getOperand(Num: 1);
7903 if (auto *C = dyn_cast<ConstantSDNode>(Val&: RHS); C && C->getZExtValue()) {
7904 Intr = LHS.getNode();
7905 break;
7906 }
7907 [[fallthrough]];
7908 }
7909 default: {
7910 // Get the target from BR if we don't negate the condition
7911 BR = findUser(Value: BRCOND, Opcode: ISD::BR);
7912 assert(BR && "brcond missing unconditional branch user");
7913 Target = BR->getOperand(Num: 1);
7914 }
7915 }
7916
7917 unsigned CFNode = isCFIntrinsic(Intr);
7918 if (CFNode == 0) {
7919 // This is a uniform branch so we don't need to legalize.
7920 return BRCOND;
7921 }
7922
7923 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
7924 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
7925
7926 assert(!SetCC ||
7927 (SetCC->getConstantOperandVal(1) == 1 &&
7928 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
7929 ISD::SETNE));
7930
7931 // operands of the new intrinsic call
7932 SmallVector<SDValue, 4> Ops;
7933 if (HaveChain)
7934 Ops.push_back(Elt: BRCOND.getOperand(i: 0));
7935
7936 Ops.append(in_start: Intr->op_begin() + (HaveChain ? 2 : 1), in_end: Intr->op_end());
7937 Ops.push_back(Elt: Target);
7938
7939 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
7940
7941 // build the new intrinsic call
7942 SDNode *Result = DAG.getNode(Opcode: CFNode, DL, VTList: DAG.getVTList(VTs: Res), Ops).getNode();
7943
7944 if (!HaveChain) {
7945 SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(i: 0)};
7946
7947 Result = DAG.getMergeValues(Ops, dl: DL).getNode();
7948 }
7949
7950 if (BR) {
7951 // Give the branch instruction our target
7952 SDValue Ops[] = {BR->getOperand(Num: 0), BRCOND.getOperand(i: 2)};
7953 SDValue NewBR = DAG.getNode(Opcode: ISD::BR, DL, VTList: BR->getVTList(), Ops);
7954 DAG.ReplaceAllUsesWith(From: BR, To: NewBR.getNode());
7955 }
7956
7957 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
7958
7959 // Copy the intrinsic results to registers
7960 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
7961 SDNode *CopyToReg = findUser(Value: SDValue(Intr, i), Opcode: ISD::CopyToReg);
7962 if (!CopyToReg)
7963 continue;
7964
7965 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: CopyToReg->getOperand(Num: 1),
7966 N: SDValue(Result, i - 1), Glue: SDValue());
7967
7968 DAG.ReplaceAllUsesWith(From: SDValue(CopyToReg, 0), To: CopyToReg->getOperand(Num: 0));
7969 }
7970
7971 // Remove the old intrinsic from the chain
7972 DAG.ReplaceAllUsesOfValueWith(From: SDValue(Intr, Intr->getNumValues() - 1),
7973 To: Intr->getOperand(Num: 0));
7974
7975 return Chain;
7976}
7977
7978SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
7979 MVT VT = Op.getSimpleValueType();
7980 SDLoc DL(Op);
7981 // Check the depth.
7982 if (Op.getConstantOperandVal(i: 0) != 0)
7983 return DAG.getConstant(Val: 0, DL, VT);
7984
7985 MachineFunction &MF = DAG.getMachineFunction();
7986 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7987 // Check for kernel and shader functions
7988 if (Info->isEntryFunction())
7989 return DAG.getConstant(Val: 0, DL, VT);
7990
7991 MachineFrameInfo &MFI = MF.getFrameInfo();
7992 // There is a call to @llvm.returnaddress in this function
7993 MFI.setReturnAddressIsTaken(true);
7994
7995 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
7996 // Get the return address reg and mark it as an implicit live-in
7997 Register Reg = MF.addLiveIn(PReg: TRI->getReturnAddressReg(MF),
7998 RC: getRegClassFor(VT, isDivergent: Op.getNode()->isDivergent()));
7999
8000 return DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: DL, Reg, VT);
8001}
8002
8003SDValue SITargetLowering::LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const {
8004 MachineFunction &MF = DAG.getMachineFunction();
8005 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
8006
8007 // For functions that set up their own stack, select the GET_STACK_BASE
8008 // pseudo.
8009 if (MFI->isBottomOfStack())
8010 return Op;
8011
8012 // For everything else, create a dummy stack object.
8013 int FI = MF.getFrameInfo().CreateFixedObject(Size: 1, SPOffset: 0, /*IsImmutable=*/false);
8014 return DAG.getFrameIndex(FI, VT: Op.getValueType());
8015}
8016
8017SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
8018 const SDLoc &DL, EVT VT) const {
8019 return Op.getValueType().bitsLE(VT)
8020 ? DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT, Operand: Op)
8021 : DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT, N1: Op,
8022 N2: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32));
8023}
8024
8025SDValue SITargetLowering::splitFP_ROUNDVectorOp(SDValue Op,
8026 SelectionDAG &DAG) const {
8027 EVT DstVT = Op.getValueType();
8028 unsigned NumElts = DstVT.getVectorNumElements();
8029 assert(NumElts > 2 && isPowerOf2_32(NumElts));
8030
8031 auto [Lo, Hi] = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 0);
8032
8033 SDLoc DL(Op);
8034 unsigned Opc = Op.getOpcode();
8035 SDValue Flags = Op.getOperand(i: 1);
8036 EVT HalfDstVT =
8037 EVT::getVectorVT(Context&: *DAG.getContext(), VT: DstVT.getScalarType(), NumElements: NumElts / 2);
8038 SDValue OpLo = DAG.getNode(Opcode: Opc, DL, VT: HalfDstVT, N1: Lo, N2: Flags);
8039 SDValue OpHi = DAG.getNode(Opcode: Opc, DL, VT: HalfDstVT, N1: Hi, N2: Flags);
8040
8041 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: DstVT, N1: OpLo, N2: OpHi);
8042}
8043
8044SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
8045 SDValue Src = Op.getOperand(i: 0);
8046 EVT SrcVT = Src.getValueType();
8047 EVT DstVT = Op.getValueType();
8048
8049 if (DstVT.isVector() && DstVT.getScalarType() == MVT::f16) {
8050 assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
8051 if (SrcVT.getScalarType() != MVT::f32)
8052 return SDValue();
8053 return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
8054 }
8055
8056 if (SrcVT.getScalarType() != MVT::f64)
8057 return Op;
8058
8059 SDLoc DL(Op);
8060 if (DstVT == MVT::f16) {
8061 // TODO: Handle strictfp
8062 if (Op.getOpcode() != ISD::FP_ROUND)
8063 return Op;
8064
8065 if (!Subtarget->has16BitInsts()) {
8066 SDValue FpToFp16 = DAG.getNode(Opcode: ISD::FP_TO_FP16, DL, VT: MVT::i32, Operand: Src);
8067 SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: FpToFp16);
8068 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f16, Operand: Trunc);
8069 }
8070 if (Op->getFlags().hasApproximateFuncs()) {
8071 SDValue Flags = Op.getOperand(i: 1);
8072 SDValue Src32 = DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: MVT::f32, N1: Src, N2: Flags);
8073 return DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: MVT::f16, N1: Src32, N2: Flags);
8074 }
8075 SDValue FpToFp16 = LowerF64ToF16Safe(Src, DL, DAG);
8076 SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: FpToFp16);
8077 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f16, Operand: Trunc);
8078 }
8079
8080 assert(DstVT.getScalarType() == MVT::bf16 &&
8081 "custom lower FP_ROUND for f16 or bf16");
8082 assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal");
8083
8084 // Round-inexact-to-odd f64 to f32, then do the final rounding using the
8085 // hardware f32 -> bf16 instruction.
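// Rounding to odd in the first step avoids double-rounding errors in the
// final f32 -> bf16 rounding.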
8086 EVT F32VT = SrcVT.changeElementType(Context&: *DAG.getContext(), EltVT: MVT::f32);
8087 SDValue Rod = expandRoundInexactToOdd(ResultVT: F32VT, Op: Src, DL, DAG);
8088 return DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: DstVT, N1: Rod,
8089 N2: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32));
8090}
8091
8092SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
8093 SelectionDAG &DAG) const {
8094 EVT VT = Op.getValueType();
8095 const MachineFunction &MF = DAG.getMachineFunction();
8096 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8097 bool IsIEEEMode = Info->getMode().IEEE;
8098
8099 // FIXME: Assert during selection that this is only selected for
8100 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
8101 // mode functions, but this happens to be OK since it's only done in cases
8102 // where it is known there is no sNaN.
8103 if (IsIEEEMode)
8104 return expandFMINNUM_FMAXNUM(N: Op.getNode(), DAG);
8105
8106 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
8107 VT == MVT::v16bf16)
8108 return splitBinaryVectorOp(Op, DAG);
8109 return Op;
8110}
8111
8112SDValue
8113SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
8114 SelectionDAG &DAG) const {
8115 EVT VT = Op.getValueType();
8116 const MachineFunction &MF = DAG.getMachineFunction();
8117 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8118 bool IsIEEEMode = Info->getMode().IEEE;
8119
8120 if (IsIEEEMode)
8121 return expandFMINIMUMNUM_FMAXIMUMNUM(N: Op.getNode(), DAG);
8122
8123 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
8124 VT == MVT::v16bf16)
8125 return splitBinaryVectorOp(Op, DAG);
8126 return Op;
8127}
8128
8129SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op,
8130 SelectionDAG &DAG) const {
8131 EVT VT = Op.getValueType();
8132 if (VT.isVector())
8133 return splitBinaryVectorOp(Op, DAG);
8134
8135 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
8136 !Subtarget->hasMinimum3Maximum3F16() &&
8137 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
8138 "should not need to widen f16 minimum/maximum to v2f16");
8139
8140 // Widen f16 operation to v2f16
8141
8142 // fminimum f16:x, f16:y ->
8143 // extract_vector_elt (fminimum (v2f16 (scalar_to_vector x))
8144 // (v2f16 (scalar_to_vector y))), 0
8145 SDLoc SL(Op);
8146 SDValue WideSrc0 =
8147 DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: SL, VT: MVT::v2f16, Operand: Op.getOperand(i: 0));
8148 SDValue WideSrc1 =
8149 DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: SL, VT: MVT::v2f16, Operand: Op.getOperand(i: 1));
8150
8151 SDValue Widened =
8152 DAG.getNode(Opcode: Op.getOpcode(), DL: SL, VT: MVT::v2f16, N1: WideSrc0, N2: WideSrc1);
8153
8154 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::f16, N1: Widened,
8155 N2: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32));
8156}
8157
8158SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
8159 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
8160 EVT VT = Op.getValueType();
8161 assert(VT == MVT::f16);
8162
8163 SDValue Exp = Op.getOperand(i: IsStrict ? 2 : 1);
8164 EVT ExpVT = Exp.getValueType();
8165 if (ExpVT == MVT::i16)
8166 return Op;
8167
8168 SDLoc DL(Op);
8169
8170 // Correct the exponent type for f16 to i16.
8171 // Clamp the range of the exponent to the instruction's range.
8172
8173  // TODO: This should be a generic narrowing legalization, and can easily be
8174  // done for GlobalISel.
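  // For example, an i32 exponent of 100000 is clamped to 32767 before the
  // truncate to i16. Any finite f16 input already overflows (or underflows for
  // large negative exponents) well before that magnitude, so the clamp cannot
  // change the result; it only makes the truncation lossless.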
8175
8176 SDValue MinExp = DAG.getSignedConstant(Val: minIntN(N: 16), DL, VT: ExpVT);
8177 SDValue ClampMin = DAG.getNode(Opcode: ISD::SMAX, DL, VT: ExpVT, N1: Exp, N2: MinExp);
8178
8179 SDValue MaxExp = DAG.getSignedConstant(Val: maxIntN(N: 16), DL, VT: ExpVT);
8180 SDValue Clamp = DAG.getNode(Opcode: ISD::SMIN, DL, VT: ExpVT, N1: ClampMin, N2: MaxExp);
8181
8182 SDValue TruncExp = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: Clamp);
8183
8184 if (IsStrict) {
8185 return DAG.getNode(Opcode: ISD::STRICT_FLDEXP, DL, ResultTys: {VT, MVT::Other},
8186 Ops: {Op.getOperand(i: 0), Op.getOperand(i: 1), TruncExp});
8187 }
8188
8189 return DAG.getNode(Opcode: ISD::FLDEXP, DL, VT, N1: Op.getOperand(i: 0), N2: TruncExp);
8190}
8191
8192static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
8193 switch (Op->getOpcode()) {
8194 case ISD::SRA:
8195 case ISD::SMIN:
8196 case ISD::SMAX:
8197 return ISD::SIGN_EXTEND;
8198 case ISD::SRL:
8199 case ISD::UMIN:
8200 case ISD::UMAX:
8201 return ISD::ZERO_EXTEND;
8202 case ISD::ADD:
8203 case ISD::SUB:
8204 case ISD::AND:
8205 case ISD::OR:
8206 case ISD::XOR:
8207 case ISD::SHL:
8208 case ISD::SELECT:
8209 case ISD::MUL:
8210 // operation result won't be influenced by garbage high bits.
8211 // TODO: are all of those cases correct, and are there more?
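    // For example, an i16 add promoted to i32: the low 16 bits of the 32-bit
    // sum depend only on the low 16 bits of the operands, so garbage in the
    // high bits introduced by ANY_EXTEND is discarded when the result is
    // truncated back to i16.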
8212 return ISD::ANY_EXTEND;
8213 case ISD::SETCC: {
8214 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: 2))->get();
8215 return ISD::isSignedIntSetCC(Code: CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
8216 }
8217 default:
8218 llvm_unreachable("unexpected opcode!");
8219 }
8220}
8221
8222SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
8223 DAGCombinerInfo &DCI) const {
8224 const unsigned Opc = Op.getOpcode();
8225 assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
8226 Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
8227 Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
8228 Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
8229 Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
8230
8231 EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
8232 : Op->getOperand(Num: 0).getValueType();
8233 auto &DAG = DCI.DAG;
8234 auto ExtTy = OpTy.changeElementType(Context&: *DAG.getContext(), EltVT: MVT::i32);
8235
8236 if (DCI.isBeforeLegalizeOps() ||
8237 isNarrowingProfitable(N: Op.getNode(), SrcVT: ExtTy, DestVT: OpTy))
8238 return SDValue();
8239
8240 SDLoc DL(Op);
8241 SDValue LHS;
8242 SDValue RHS;
8243 if (Opc == ISD::SELECT) {
8244 LHS = Op->getOperand(Num: 1);
8245 RHS = Op->getOperand(Num: 2);
8246 } else {
8247 LHS = Op->getOperand(Num: 0);
8248 RHS = Op->getOperand(Num: 1);
8249 }
8250
8251 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
8252 LHS = DAG.getNode(Opcode: ExtOp, DL, VT: ExtTy, Operand: {LHS});
8253
8254 // Special case: for shifts, the RHS always needs a zext.
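  // For example, for a promoted i16 shift, ANY_EXTEND could leave garbage in
  // the high bits of the amount, and the widened i32 shift would treat those
  // bits as part of the shift amount; zero-extending preserves the value.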
8255 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
8256 RHS = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: ExtTy, Operand: {RHS});
8257 else
8258 RHS = DAG.getNode(Opcode: ExtOp, DL, VT: ExtTy, Operand: {RHS});
8259
8260  // setcc always returns i1 / an i1 vector, so no need to truncate after.
8261 if (Opc == ISD::SETCC) {
8262 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: 2))->get();
8263 return DAG.getSetCC(DL, VT: Op.getValueType(), LHS, RHS, Cond: CC);
8264 }
8265
8266 // For other ops, we extend the operation's return type as well so we need to
8267 // truncate back to the original type.
8268 SDValue NewVal;
8269 if (Opc == ISD::SELECT)
8270 NewVal = DAG.getNode(Opcode: ISD::SELECT, DL, VT: ExtTy, Ops: {Op->getOperand(Num: 0), LHS, RHS});
8271 else
8272 NewVal = DAG.getNode(Opcode: Opc, DL, VT: ExtTy, Ops: {LHS, RHS});
8273
8274 return DAG.getZExtOrTrunc(Op: NewVal, DL, VT: OpTy);
8275}
8276
8277SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
8278 SDValue Mag = Op.getOperand(i: 0);
8279 EVT MagVT = Mag.getValueType();
8280
8281 if (MagVT.getVectorNumElements() > 2)
8282 return splitBinaryVectorOp(Op, DAG);
8283
8284 SDValue Sign = Op.getOperand(i: 1);
8285 EVT SignVT = Sign.getValueType();
8286
8287 if (MagVT == SignVT)
8288 return Op;
8289
8290 // fcopysign v2f16:mag, v2f32:sign ->
8291 // fcopysign v2f16:mag, bitcast (trunc (bitcast sign to v2i32) to v2i16)
8292
8293 SDLoc SL(Op);
8294 SDValue SignAsInt32 = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Sign);
8295 SDValue SignAsInt16 = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::v2i16, Operand: SignAsInt32);
8296
8297 SDValue SignAsHalf16 = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MagVT, Operand: SignAsInt16);
8298
8299 return DAG.getNode(Opcode: ISD::FCOPYSIGN, DL: SL, VT: MagVT, N1: Mag, N2: SignAsHalf16);
8300}
8301
8302// Custom lowering for vector multiplications and s_mul_u64.
8303SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
8304 EVT VT = Op.getValueType();
8305
8306 // Split vector operands.
8307 if (VT.isVector())
8308 return splitBinaryVectorOp(Op, DAG);
8309
8310  assert(VT == MVT::i64 && "The following code is specific to s_mul_u64");
8311
8312 // There are four ways to lower s_mul_u64:
8313 //
8314 // 1. If all the operands are uniform, then we lower it as it is.
8315 //
8316  // 2. If the operands are divergent, then we have to split s_mul_u64 into 32-bit
8317  // multiplications because there is no vector equivalent of s_mul_u64.
8318 //
8319 // 3. If the cost model decides that it is more efficient to use vector
8320  // registers, then we have to split s_mul_u64 into 32-bit multiplications.
8321  // This happens in splitScalarSMULU64() in SIInstrInfo.cpp.
8322 //
8323 // 4. If the cost model decides to use vector registers and both of the
8324 // operands are zero-extended/sign-extended from 32-bits, then we split the
8325  // s_mul_u64 into two 32-bit multiplications. The problem is that it is not
8326 // possible to check if the operands are zero-extended or sign-extended in
8327 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
8328 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
8329 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
8330 // If the cost model decides that we have to use vector registers, then
8331  // splitScalarSMulPseudo() (in SIInstrInfo.cpp) splits s_mul_u64_u32/
8332  // s_mul_i64_i32_pseudo into two vector multiplications. If the cost model
8333 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
8334 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
8335  // SIInstrInfo.cpp.
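  // For example, (mul (zext i32 %a to i64), (zext i32 %b to i64)) has at least
  // 32 known leading zero bits in each operand, so it is rewritten below to
  // S_MUL_U64_U32_PSEUDO; the sign-extended analogue becomes
  // S_MUL_I64_I32_PSEUDO.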
8336
8337 if (Op->isDivergent())
8338 return SDValue();
8339
8340 SDValue Op0 = Op.getOperand(i: 0);
8341 SDValue Op1 = Op.getOperand(i: 1);
8342  // If both operands are zero-extended to 32 bits, then we replace s_mul_u64
8343  // with s_mul_u64_u32_pseudo. If both operands are sign-extended to 32 bits,
8344  // then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
8345 KnownBits Op0KnownBits = DAG.computeKnownBits(Op: Op0);
8346 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
8347 KnownBits Op1KnownBits = DAG.computeKnownBits(Op: Op1);
8348 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
8349 SDLoc SL(Op);
8350 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
8351 return SDValue(
8352 DAG.getMachineNode(Opcode: AMDGPU::S_MUL_U64_U32_PSEUDO, dl: SL, VT, Op1: Op0, Op2: Op1), 0);
8353 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op: Op0);
8354 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op: Op1);
8355 if (Op0SignBits >= 33 && Op1SignBits >= 33)
8356 return SDValue(
8357 DAG.getMachineNode(Opcode: AMDGPU::S_MUL_I64_I32_PSEUDO, dl: SL, VT, Op1: Op0, Op2: Op1), 0);
8358 // If all the operands are uniform, then we lower s_mul_u64 as it is.
8359 return Op;
8360}
8361
8362SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
8363 EVT VT = Op.getValueType();
8364 SDLoc SL(Op);
8365 SDValue LHS = Op.getOperand(i: 0);
8366 SDValue RHS = Op.getOperand(i: 1);
8367 bool isSigned = Op.getOpcode() == ISD::SMULO;
8368
8369 if (ConstantSDNode *RHSC = isConstOrConstSplat(N: RHS)) {
8370 const APInt &C = RHSC->getAPIntValue();
8371 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
8372 if (C.isPowerOf2()) {
8373      // smulo(x, signed_min) is the same as umulo(x, signed_min).
8374 bool UseArithShift = isSigned && !C.isMinSignedValue();
8375 SDValue ShiftAmt = DAG.getConstant(Val: C.logBase2(), DL: SL, VT: MVT::i32);
8376 SDValue Result = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT, N1: LHS, N2: ShiftAmt);
8377 SDValue Overflow =
8378 DAG.getSetCC(DL: SL, VT: MVT::i1,
8379 LHS: DAG.getNode(Opcode: UseArithShift ? ISD::SRA : ISD::SRL, DL: SL, VT,
8380 N1: Result, N2: ShiftAmt),
8381 RHS: LHS, Cond: ISD::SETNE);
8382 return DAG.getMergeValues(Ops: {Result, Overflow}, dl: SL);
8383 }
8384 }
8385
8386 SDValue Result = DAG.getNode(Opcode: ISD::MUL, DL: SL, VT, N1: LHS, N2: RHS);
8387 SDValue Top =
8388 DAG.getNode(Opcode: isSigned ? ISD::MULHS : ISD::MULHU, DL: SL, VT, N1: LHS, N2: RHS);
8389
8390 SDValue Sign = isSigned
8391 ? DAG.getNode(Opcode: ISD::SRA, DL: SL, VT, N1: Result,
8392 N2: DAG.getConstant(Val: VT.getScalarSizeInBits() - 1,
8393 DL: SL, VT: MVT::i32))
8394 : DAG.getConstant(Val: 0, DL: SL, VT);
8395 SDValue Overflow = DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: Top, RHS: Sign, Cond: ISD::SETNE);
8396
8397 return DAG.getMergeValues(Ops: {Result, Overflow}, dl: SL);
8398}
8399
8400SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
8401 if (Op->isDivergent()) {
8402 // Select to V_MAD_[IU]64_[IU]32.
8403 return Op;
8404 }
8405 if (Subtarget->hasSMulHi()) {
8406 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
8407 return SDValue();
8408 }
8409 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
8410 // calculate the high part, so we might as well do the whole thing with
8411 // V_MAD_[IU]64_[IU]32.
8412 return Op;
8413}
8414
8415SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
8416 if (!Subtarget->hasTrapHandler() ||
8417 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
8418 return lowerTrapEndpgm(Op, DAG);
8419
8420 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
8421 : lowerTrapHsaQueuePtr(Op, DAG);
8422}
8423
8424SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
8425 SDLoc SL(Op);
8426 SDValue Chain = Op.getOperand(i: 0);
8427 return DAG.getNode(Opcode: AMDGPUISD::ENDPGM_TRAP, DL: SL, VT: MVT::Other, Operand: Chain);
8428}
8429
8430SDValue
8431SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
8432 const SDLoc &DL, Align Alignment,
8433 ImplicitParameter Param) const {
8434 MachineFunction &MF = DAG.getMachineFunction();
8435 uint64_t Offset = getImplicitParameterOffset(MF, Param);
8436 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL: DL, Chain: DAG.getEntryNode(), Offset);
8437 MachinePointerInfo PtrInfo =
8438 getKernargSegmentPtrInfo(MF&: DAG.getMachineFunction());
8439 return DAG.getLoad(
8440 VT, dl: DL, Chain: DAG.getEntryNode(), Ptr, PtrInfo: PtrInfo.getWithOffset(O: Offset), Alignment,
8441 MMOFlags: MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant);
8442}
8443
8444SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
8445 SelectionDAG &DAG) const {
8446 SDLoc SL(Op);
8447 SDValue Chain = Op.getOperand(i: 0);
8448
8449 SDValue QueuePtr;
8450 // For code object version 5, QueuePtr is passed through implicit kernarg.
8451 const Module *M = DAG.getMachineFunction().getFunction().getParent();
8452 if (AMDGPU::getAMDHSACodeObjectVersion(M: *M) >= AMDGPU::AMDHSA_COV5) {
8453 QueuePtr =
8454 loadImplicitKernelArgument(DAG, VT: MVT::i64, DL: SL, Alignment: Align(8), Param: QUEUE_PTR);
8455 } else {
8456 MachineFunction &MF = DAG.getMachineFunction();
8457 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8458 Register UserSGPR = Info->getQueuePtrUserSGPR();
8459
8460 if (UserSGPR == AMDGPU::NoRegister) {
8461 // We probably are in a function incorrectly marked with
8462 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
8463 // trap, so just use a null pointer.
8464 QueuePtr = DAG.getConstant(Val: 0, DL: SL, VT: MVT::i64);
8465 } else {
8466 QueuePtr = CreateLiveInRegister(DAG, RC: &AMDGPU::SReg_64RegClass, Reg: UserSGPR,
8467 VT: MVT::i64);
8468 }
8469 }
8470
8471 SDValue SGPR01 = DAG.getRegister(Reg: AMDGPU::SGPR0_SGPR1, VT: MVT::i64);
8472 SDValue ToReg = DAG.getCopyToReg(Chain, dl: SL, Reg: SGPR01, N: QueuePtr, Glue: SDValue());
8473
8474 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8475 SDValue Ops[] = {ToReg, DAG.getTargetConstant(Val: TrapID, DL: SL, VT: MVT::i16), SGPR01,
8476 ToReg.getValue(R: 1)};
8477 return DAG.getNode(Opcode: AMDGPUISD::TRAP, DL: SL, VT: MVT::Other, Ops);
8478}
8479
8480SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
8481 SDLoc SL(Op);
8482 SDValue Chain = Op.getOperand(i: 0);
8483
8484 // We need to simulate the 's_trap 2' instruction on targets that run in
8485 // PRIV=1 (where it is treated as a nop).
8486 if (Subtarget->hasPrivEnabledTrap2NopBug())
8487 return DAG.getNode(Opcode: AMDGPUISD::SIMULATED_TRAP, DL: SL, VT: MVT::Other, Operand: Chain);
8488
8489 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8490 SDValue Ops[] = {Chain, DAG.getTargetConstant(Val: TrapID, DL: SL, VT: MVT::i16)};
8491 return DAG.getNode(Opcode: AMDGPUISD::TRAP, DL: SL, VT: MVT::Other, Ops);
8492}
8493
8494SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
8495 SDLoc SL(Op);
8496 SDValue Chain = Op.getOperand(i: 0);
8497 MachineFunction &MF = DAG.getMachineFunction();
8498
8499 if (!Subtarget->hasTrapHandler() ||
8500 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
8501 LLVMContext &Ctx = MF.getFunction().getContext();
8502 Ctx.diagnose(DI: DiagnosticInfoUnsupported(MF.getFunction(),
8503 "debugtrap handler not supported",
8504 Op.getDebugLoc(), DS_Warning));
8505 return Chain;
8506 }
8507
8508 uint64_t TrapID =
8509 static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
8510 SDValue Ops[] = {Chain, DAG.getTargetConstant(Val: TrapID, DL: SL, VT: MVT::i16)};
8511 return DAG.getNode(Opcode: AMDGPUISD::TRAP, DL: SL, VT: MVT::Other, Ops);
8512}
8513
8514SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
8515 SelectionDAG &DAG) const {
8516 if (Subtarget->hasApertureRegs()) {
8517 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
8518 ? AMDGPU::SRC_SHARED_BASE
8519 : AMDGPU::SRC_PRIVATE_BASE;
8520 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
8521 !Subtarget->hasGloballyAddressableScratch()) &&
8522 "Cannot use src_private_base with globally addressable scratch!");
8523 // Note: this feature (register) is broken. When used as a 32-bit operand,
8524 // it returns a wrong value (all zeroes?). The real value is in the upper 32
8525 // bits.
8526 //
8527    // To work around the issue, emit a 64-bit copy from this register and
8528    // then extract the high bits. Note that this shouldn't even result in a
8529    // shift being emitted and should simply become a pair of registers (e.g.):
8530 // s_mov_b64 s[6:7], src_shared_base
8531 // v_mov_b32_e32 v1, s7
8532 SDValue Copy =
8533 DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: DL, Reg: ApertureRegNo, VT: MVT::v2i32);
8534 return DAG.getExtractVectorElt(DL, VT: MVT::i32, Vec: Copy, Idx: 1);
8535 }
8536
8537 // For code object version 5, private_base and shared_base are passed through
8538 // implicit kernargs.
8539 const Module *M = DAG.getMachineFunction().getFunction().getParent();
8540 if (AMDGPU::getAMDHSACodeObjectVersion(M: *M) >= AMDGPU::AMDHSA_COV5) {
8541 ImplicitParameter Param =
8542 (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
8543 return loadImplicitKernelArgument(DAG, VT: MVT::i32, DL, Alignment: Align(4), Param);
8544 }
8545
8546 MachineFunction &MF = DAG.getMachineFunction();
8547 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8548 Register UserSGPR = Info->getQueuePtrUserSGPR();
8549 if (UserSGPR == AMDGPU::NoRegister) {
8550 // We probably are in a function incorrectly marked with
8551 // amdgpu-no-queue-ptr. This is undefined.
8552 return DAG.getPOISON(VT: MVT::i32);
8553 }
8554
8555 SDValue QueuePtr =
8556 CreateLiveInRegister(DAG, RC: &AMDGPU::SReg_64RegClass, Reg: UserSGPR, VT: MVT::i64);
8557
8558 // Offset into amd_queue_t for group_segment_aperture_base_hi /
8559 // private_segment_aperture_base_hi.
8560 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
8561
8562 SDValue Ptr =
8563 DAG.getObjectPtrOffset(SL: DL, Ptr: QueuePtr, Offset: TypeSize::getFixed(ExactSize: StructOffset));
8564
8565 // TODO: Use custom target PseudoSourceValue.
8566 // TODO: We should use the value from the IR intrinsic call, but it might not
8567 // be available and how do we get it?
8568 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
8569 return DAG.getLoad(VT: MVT::i32, dl: DL, Chain: QueuePtr.getValue(R: 1), Ptr, PtrInfo,
8570 Alignment: commonAlignment(A: Align(64), Offset: StructOffset),
8571 MMOFlags: MachineMemOperand::MODereferenceable |
8572 MachineMemOperand::MOInvariant);
8573}
8574
8575/// Return true if the value is a known valid address, such that a null check is
8576/// not necessary.
8577static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
8578 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
8579 if (isa<FrameIndexSDNode, GlobalAddressSDNode, BasicBlockSDNode>(Val))
8580 return true;
8581
8582 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
8583 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
8584
8585 // TODO: Search through arithmetic, handle arguments and loads
8586 // marked nonnull.
8587 return false;
8588}
8589
8590SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
8591 SelectionDAG &DAG) const {
8592 SDLoc SL(Op);
8593
8594 const AMDGPUTargetMachine &TM =
8595 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
8596
8597 unsigned DestAS, SrcAS;
8598 SDValue Src;
8599 bool IsNonNull = false;
8600 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Val&: Op)) {
8601 SrcAS = ASC->getSrcAddressSpace();
8602 Src = ASC->getOperand(Num: 0);
8603 DestAS = ASC->getDestAddressSpace();
8604 } else {
8605 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
8606 Op.getConstantOperandVal(0) ==
8607 Intrinsic::amdgcn_addrspacecast_nonnull);
8608 Src = Op->getOperand(Num: 1);
8609 SrcAS = Op->getConstantOperandVal(Num: 2);
8610 DestAS = Op->getConstantOperandVal(Num: 3);
8611 IsNonNull = true;
8612 }
8613
8614 SDValue FlatNullPtr = DAG.getConstant(Val: 0, DL: SL, VT: MVT::i64);
8615
8616 // flat -> local/private
8617 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
8618 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
8619 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
8620 SDValue Ptr = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: Src);
8621
8622 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
8623 Subtarget->hasGloballyAddressableScratch()) {
8624 // flat -> private with globally addressable scratch: subtract
8625 // src_flat_scratch_base_lo.
8626 SDValue FlatScratchBaseLo(
8627 DAG.getMachineNode(
8628 Opcode: AMDGPU::S_MOV_B32, dl: SL, VT: MVT::i32,
8629 Op1: DAG.getRegister(Reg: AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, VT: MVT::i32)),
8630 0);
8631 Ptr = DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i32, N1: Ptr, N2: FlatScratchBaseLo);
8632 }
8633
8634 if (IsNonNull || isKnownNonNull(Val: Op, DAG, TM, AddrSpace: SrcAS))
8635 return Ptr;
8636
8637 unsigned NullVal = TM.getNullPointerValue(AddrSpace: DestAS);
8638 SDValue SegmentNullPtr = DAG.getConstant(Val: NullVal, DL: SL, VT: MVT::i32);
8639 SDValue NonNull = DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: Src, RHS: FlatNullPtr, Cond: ISD::SETNE);
8640
8641 return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::i32, N1: NonNull, N2: Ptr,
8642 N3: SegmentNullPtr);
8643 }
8644 }
8645
8646 // local/private -> flat
8647 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
8648 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
8649 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
8650 SDValue CvtPtr;
8651 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
8652 Subtarget->hasGloballyAddressableScratch()) {
8653 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
8654 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
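        // The shift amount below is 57 - 32 - log2(wavefront size) because the
        // thread id is built in the high 32-bit half of the pointer: a shift of
        // 20 for wave32 places TID[4:0] at bit 52, and 19 for wave64 places
        // TID[5:0] at bit 51, matching the layout above.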
8655 SDValue AllOnes = DAG.getSignedTargetConstant(Val: -1, DL: SL, VT: MVT::i32);
8656 SDValue ThreadID = DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32);
8657 ThreadID = DAG.getNode(
8658 Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32,
8659 N1: DAG.getTargetConstant(Val: Intrinsic::amdgcn_mbcnt_lo, DL: SL, VT: MVT::i32),
8660 N2: AllOnes, N3: ThreadID);
8661 if (Subtarget->isWave64())
8662 ThreadID = DAG.getNode(
8663 Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32,
8664 N1: DAG.getTargetConstant(Val: Intrinsic::amdgcn_mbcnt_hi, DL: SL, VT: MVT::i32),
8665 N2: AllOnes, N3: ThreadID);
8666 SDValue ShAmt = DAG.getShiftAmountConstant(
8667 Val: 57 - 32 - Subtarget->getWavefrontSizeLog2(), VT: MVT::i32, DL: SL);
8668 SDValue SrcHi = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: ThreadID, N2: ShAmt);
8669 CvtPtr = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32, N1: Src, N2: SrcHi);
8670 CvtPtr = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: CvtPtr);
8671 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
8672 // 64-bit hi:lo value.
8673 SDValue FlatScratchBase = {
8674 DAG.getMachineNode(
8675 Opcode: AMDGPU::S_MOV_B64, dl: SL, VT: MVT::i64,
8676 Op1: DAG.getRegister(Reg: AMDGPU::SRC_FLAT_SCRATCH_BASE, VT: MVT::i64)),
8677 0};
8678 CvtPtr = DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i64, N1: CvtPtr, N2: FlatScratchBase);
8679 } else {
8680 SDValue Aperture = getSegmentAperture(AS: SrcAS, DL: SL, DAG);
8681 CvtPtr = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32, N1: Src, N2: Aperture);
8682 CvtPtr = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: CvtPtr);
8683 }
8684
8685 if (IsNonNull || isKnownNonNull(Val: Op, DAG, TM, AddrSpace: SrcAS))
8686 return CvtPtr;
8687
8688 unsigned NullVal = TM.getNullPointerValue(AddrSpace: SrcAS);
8689 SDValue SegmentNullPtr = DAG.getConstant(Val: NullVal, DL: SL, VT: MVT::i32);
8690
8691 SDValue NonNull =
8692 DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: Src, RHS: SegmentNullPtr, Cond: ISD::SETNE);
8693
8694 return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::i64, N1: NonNull, N2: CvtPtr,
8695 N3: FlatNullPtr);
8696 }
8697 }
8698
8699 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
8700 Op.getValueType() == MVT::i64) {
8701 const SIMachineFunctionInfo *Info =
8702 DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
8703 if (Info->get32BitAddressHighBits() == 0)
8704 return DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: MVT::i64, Operand: Src);
8705
8706 SDValue Hi = DAG.getConstant(Val: Info->get32BitAddressHighBits(), DL: SL, VT: MVT::i32);
8707 SDValue Vec = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32, N1: Src, N2: Hi);
8708 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: Vec);
8709 }
8710
8711 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
8712 Src.getValueType() == MVT::i64)
8713 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: Src);
8714
8715 // global <-> flat are no-ops and never emitted.
8716
8717 // Invalid casts are poison.
8718 return DAG.getPOISON(VT: Op->getValueType(ResNo: 0));
8719}
8720
8721// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
8722// the small vector and inserting them into the big vector. That is better than
8723// the default expansion of doing it via a stack slot. Even though the use of
8724// the stack slot would be optimized away afterwards, the stack slot itself
8725// remains.
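// For example, inserting a v2i16 subvector into a v4i16 vector at index 2
// becomes a single i32 element insert: both vectors are bitcast to i32-based
// types and the subvector is inserted as dword 1, then the result is bitcast
// back to v4i16. Wider or non-16-bit cases insert the elements one at a time.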
8726SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
8727 SelectionDAG &DAG) const {
8728 SDValue Vec = Op.getOperand(i: 0);
8729 SDValue Ins = Op.getOperand(i: 1);
8730 SDValue Idx = Op.getOperand(i: 2);
8731 EVT VecVT = Vec.getValueType();
8732 EVT InsVT = Ins.getValueType();
8733 EVT EltVT = VecVT.getVectorElementType();
8734 unsigned InsNumElts = InsVT.getVectorNumElements();
8735 unsigned IdxVal = Idx->getAsZExtVal();
8736 SDLoc SL(Op);
8737
8738 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
8739 // Insert 32-bit registers at a time.
8740 assert(InsNumElts % 2 == 0 && "expect legal vector types");
8741
8742 unsigned VecNumElts = VecVT.getVectorNumElements();
8743 EVT NewVecVT =
8744 EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements: VecNumElts / 2);
8745 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
8746 : EVT::getVectorVT(Context&: *DAG.getContext(),
8747 VT: MVT::i32, NumElements: InsNumElts / 2);
8748
8749 Vec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVecVT, Operand: Vec);
8750 Ins = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewInsVT, Operand: Ins);
8751
8752 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
8753 SDValue Elt;
8754 if (InsNumElts == 2) {
8755 Elt = Ins;
8756 } else {
8757 Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Ins,
8758 N2: DAG.getConstant(Val: I, DL: SL, VT: MVT::i32));
8759 }
8760 Vec = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL: SL, VT: NewVecVT, N1: Vec, N2: Elt,
8761 N3: DAG.getConstant(Val: IdxVal / 2 + I, DL: SL, VT: MVT::i32));
8762 }
8763
8764 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: VecVT, Operand: Vec);
8765 }
8766
8767 for (unsigned I = 0; I != InsNumElts; ++I) {
8768 SDValue Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, N1: Ins,
8769 N2: DAG.getConstant(Val: I, DL: SL, VT: MVT::i32));
8770 Vec = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL: SL, VT: VecVT, N1: Vec, N2: Elt,
8771 N3: DAG.getConstant(Val: IdxVal + I, DL: SL, VT: MVT::i32));
8772 }
8773 return Vec;
8774}
8775
8776SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
8777 SelectionDAG &DAG) const {
8778 SDValue Vec = Op.getOperand(i: 0);
8779 SDValue InsVal = Op.getOperand(i: 1);
8780 SDValue Idx = Op.getOperand(i: 2);
8781 EVT VecVT = Vec.getValueType();
8782 EVT EltVT = VecVT.getVectorElementType();
8783 unsigned VecSize = VecVT.getSizeInBits();
8784 unsigned EltSize = EltVT.getSizeInBits();
8785 SDLoc SL(Op);
8786
8787 // Specially handle the case of v4i16 with static indexing.
8788 unsigned NumElts = VecVT.getVectorNumElements();
8789 auto *KIdx = dyn_cast<ConstantSDNode>(Val&: Idx);
8790 if (NumElts == 4 && EltSize == 16 && KIdx) {
8791 SDValue BCVec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Vec);
8792
8793 SDValue LoHalf = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: BCVec,
8794 N2: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32));
8795 SDValue HiHalf = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: BCVec,
8796 N2: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32));
8797
8798 SDValue LoVec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i16, Operand: LoHalf);
8799 SDValue HiVec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i16, Operand: HiHalf);
8800
8801 unsigned Idx = KIdx->getZExtValue();
8802 bool InsertLo = Idx < 2;
8803 SDValue InsHalf = DAG.getNode(
8804 Opcode: ISD::INSERT_VECTOR_ELT, DL: SL, VT: MVT::v2i16, N1: InsertLo ? LoVec : HiVec,
8805 N2: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i16, Operand: InsVal),
8806 N3: DAG.getConstant(Val: InsertLo ? Idx : (Idx - 2), DL: SL, VT: MVT::i32));
8807
8808 InsHalf = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: InsHalf);
8809
8810 SDValue Concat =
8811 InsertLo ? DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {InsHalf, HiHalf})
8812 : DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {LoHalf, InsHalf});
8813
8814 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: VecVT, Operand: Concat);
8815 }
8816
8817  // Static indexing does not lower to a stack access, and hence there is no
8818  // need for special custom lowering to avoid one.
8819 if (isa<ConstantSDNode>(Val: Idx))
8820 return SDValue();
8821
8822 // Avoid stack access for dynamic indexing by custom lowering to
8823 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
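  // For example, for a v4i16 vector and a dynamic index idx:
  //   ScaledIdx = idx << 4                     (element index -> bit index)
  //   BFM       = 0xffff << ScaledIdx          (mask selecting the element)
  //   result    = (BFM & splat(val)) | (~BFM & vec)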
8824
8825 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
8826
8827 MVT IntVT = MVT::getIntegerVT(BitWidth: VecSize);
8828
8829 // Convert vector index to bit-index and get the required bit mask.
8830 assert(isPowerOf2_32(EltSize));
8831 const auto EltMask = maskTrailingOnes<uint64_t>(N: EltSize);
8832 SDValue ScaleFactor = DAG.getConstant(Val: Log2_32(Value: EltSize), DL: SL, VT: MVT::i32);
8833 SDValue ScaledIdx = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: Idx, N2: ScaleFactor);
8834 SDValue BFM = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: IntVT,
8835 N1: DAG.getConstant(Val: EltMask, DL: SL, VT: IntVT), N2: ScaledIdx);
8836
8837 // 1. Create a congruent vector with the target value in each element.
8838 SDValue ExtVal = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: IntVT,
8839 Operand: DAG.getSplatBuildVector(VT: VecVT, DL: SL, Op: InsVal));
8840
8841 // 2. Mask off all other indices except the required index within (1).
8842 SDValue LHS = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: IntVT, N1: BFM, N2: ExtVal);
8843
8844 // 3. Mask off the required index within the target vector.
8845 SDValue BCVec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: IntVT, Operand: Vec);
8846 SDValue RHS =
8847 DAG.getNode(Opcode: ISD::AND, DL: SL, VT: IntVT, N1: DAG.getNOT(DL: SL, Val: BFM, VT: IntVT), N2: BCVec);
8848
8849 // 4. Get (2) and (3) ORed into the target vector.
8850 SDValue BFI =
8851 DAG.getNode(Opcode: ISD::OR, DL: SL, VT: IntVT, N1: LHS, N2: RHS, Flags: SDNodeFlags::Disjoint);
8852
8853 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: VecVT, Operand: BFI);
8854}
8855
8856SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
8857 SelectionDAG &DAG) const {
8858 SDLoc SL(Op);
8859
8860 EVT ResultVT = Op.getValueType();
8861 SDValue Vec = Op.getOperand(i: 0);
8862 SDValue Idx = Op.getOperand(i: 1);
8863 EVT VecVT = Vec.getValueType();
8864 unsigned VecSize = VecVT.getSizeInBits();
8865 EVT EltVT = VecVT.getVectorElementType();
8866
8867 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
8868
8869 // Make sure we do any optimizations that will make it easier to fold
8870 // source modifiers before obscuring it with bit operations.
8871
8872 // XXX - Why doesn't this get called when vector_shuffle is expanded?
8873 if (SDValue Combined = performExtractVectorEltCombine(N: Op.getNode(), DCI))
8874 return Combined;
8875
8876 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
8877 SDValue Lo, Hi;
8878 auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VT: VecVT);
8879
8880 if (VecSize == 128) {
8881 SDValue V2 = DAG.getBitcast(VT: MVT::v2i64, V: Vec);
8882 Lo = DAG.getBitcast(VT: LoVT,
8883 V: DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i64, N1: V2,
8884 N2: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32)));
8885 Hi = DAG.getBitcast(VT: HiVT,
8886 V: DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i64, N1: V2,
8887 N2: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32)));
8888 } else if (VecSize == 256) {
8889 SDValue V2 = DAG.getBitcast(VT: MVT::v4i64, V: Vec);
8890 SDValue Parts[4];
8891 for (unsigned P = 0; P < 4; ++P) {
8892 Parts[P] = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i64, N1: V2,
8893 N2: DAG.getConstant(Val: P, DL: SL, VT: MVT::i32));
8894 }
8895
8896 Lo = DAG.getBitcast(VT: LoVT, V: DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i64,
8897 N1: Parts[0], N2: Parts[1]));
8898 Hi = DAG.getBitcast(VT: HiVT, V: DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i64,
8899 N1: Parts[2], N2: Parts[3]));
8900 } else {
8901 assert(VecSize == 512);
8902
8903 SDValue V2 = DAG.getBitcast(VT: MVT::v8i64, V: Vec);
8904 SDValue Parts[8];
8905 for (unsigned P = 0; P < 8; ++P) {
8906 Parts[P] = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i64, N1: V2,
8907 N2: DAG.getConstant(Val: P, DL: SL, VT: MVT::i32));
8908 }
8909
8910 Lo = DAG.getBitcast(VT: LoVT,
8911 V: DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v4i64,
8912 N1: Parts[0], N2: Parts[1], N3: Parts[2], N4: Parts[3]));
8913 Hi = DAG.getBitcast(VT: HiVT,
8914 V: DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v4i64,
8915 N1: Parts[4], N2: Parts[5], N3: Parts[6], N4: Parts[7]));
8916 }
8917
8918 EVT IdxVT = Idx.getValueType();
8919 unsigned NElem = VecVT.getVectorNumElements();
8920 assert(isPowerOf2_32(NElem));
8921 SDValue IdxMask = DAG.getConstant(Val: NElem / 2 - 1, DL: SL, VT: IdxVT);
8922 SDValue NewIdx = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: IdxVT, N1: Idx, N2: IdxMask);
8923 SDValue Half = DAG.getSelectCC(DL: SL, LHS: Idx, RHS: IdxMask, True: Hi, False: Lo, Cond: ISD::SETUGT);
8924 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, N1: Half, N2: NewIdx);
8925 }
8926
8927 assert(VecSize <= 64);
8928
8929 MVT IntVT = MVT::getIntegerVT(BitWidth: VecSize);
8930
8931 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
8932 SDValue VecBC = peekThroughBitcasts(V: Vec);
8933 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
8934 SDValue Src = VecBC.getOperand(i: 0);
8935 Src = DAG.getBitcast(VT: Src.getValueType().changeTypeToInteger(), V: Src);
8936 Vec = DAG.getAnyExtOrTrunc(Op: Src, DL: SL, VT: IntVT);
8937 }
8938
8939 unsigned EltSize = EltVT.getSizeInBits();
8940 assert(isPowerOf2_32(EltSize));
8941
8942 SDValue ScaleFactor = DAG.getConstant(Val: Log2_32(Value: EltSize), DL: SL, VT: MVT::i32);
8943
8944 // Convert vector index to bit-index (* EltSize)
8945 SDValue ScaledIdx = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: Idx, N2: ScaleFactor);
8946
8947 SDValue BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: IntVT, Operand: Vec);
8948 SDValue Elt = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: IntVT, N1: BC, N2: ScaledIdx);
8949
8950 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
8951 SDValue Result = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i16, Operand: Elt);
8952 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: ResultVT, Operand: Result);
8953 }
8954
8955 return DAG.getAnyExtOrTrunc(Op: Elt, DL: SL, VT: ResultVT);
8956}
8957
8958static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
8959 assert(Elt % 2 == 0);
8960 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
8961}
8962
8963static bool elementPairIsOddToEven(ArrayRef<int> Mask, int Elt) {
8964 assert(Elt % 2 == 0);
8965 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
8966 !(Mask[Elt + 1] & 1);
8967}
8968
8969SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
8970 SelectionDAG &DAG) const {
8971 SDLoc SL(Op);
8972 EVT ResultVT = Op.getValueType();
8973 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Val&: Op);
8974 MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
8975 const int NewSrcNumElts = 2;
8976 MVT PackVT = MVT::getVectorVT(VT: EltVT, NumElements: NewSrcNumElts);
8977 int SrcNumElts = Op.getOperand(i: 0).getValueType().getVectorNumElements();
8978
8979 // Break up the shuffle into registers sized pieces.
8980 //
8981 // We're trying to form sub-shuffles that the register allocation pipeline
8982 // won't be able to figure out, like how to use v_pk_mov_b32 to do a register
8983 // blend or 16-bit op_sel. It should be able to figure out how to reassemble a
8984 // pair of copies into a consecutive register copy, so use the ordinary
8985 // extract_vector_elt lowering unless we can use the shuffle.
8986 //
8987  // TODO: This is a bit of a hack, and we should probably always use
8988 // extract_subvector for the largest possible subvector we can (or at least
8989 // use it for PackVT aligned pieces). However we have worse support for
8990  // combines on them and don't directly treat extract_subvector / insert_subvector
8991 // as legal. The DAG scheduler also ends up doing a worse job with the
8992 // extract_subvectors.
8993 const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;
8994
8995 // vector_shuffle <0,1,6,7> lhs, rhs
8996 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
8997 //
8998 // vector_shuffle <6,7,2,3> lhs, rhs
8999 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
9000 //
9001 // vector_shuffle <6,7,0,1> lhs, rhs
9002 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
9003
9004 // Avoid scalarizing when both halves are reading from consecutive elements.
9005
9006 // If we're treating 2 element shuffles as legal, also create odd-to-even
9007 // shuffles of neighboring pairs.
9008 //
9009 // vector_shuffle <3,2,7,6> lhs, rhs
9010 // -> concat_vectors vector_shuffle <1, 0> (extract_subvector lhs, 0)
9011 // vector_shuffle <1, 0> (extract_subvector rhs, 2)
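  // Pairs that are neither contiguous nor a legal odd-to-even shuffle are
  // scalarized instead; e.g. with 4-element sources:
  //
  // vector_shuffle <0,5,...> lhs, rhs
  // -> ... (build_vector (extract_vector_elt lhs, 0),
  //                      (extract_vector_elt rhs, 1)) ...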
9012
9013 SmallVector<SDValue, 16> Pieces;
9014 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
9015 if (ShouldUseConsecutiveExtract &&
9016 elementPairIsContiguous(Mask: SVN->getMask(), Elt: I)) {
9017 const int Idx = SVN->getMaskElt(Idx: I);
9018 int VecIdx = Idx < SrcNumElts ? 0 : 1;
9019 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
9020 SDValue SubVec = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: PackVT,
9021 N1: SVN->getOperand(Num: VecIdx),
9022 N2: DAG.getConstant(Val: EltIdx, DL: SL, VT: MVT::i32));
9023 Pieces.push_back(Elt: SubVec);
9024 } else if (elementPairIsOddToEven(Mask: SVN->getMask(), Elt: I) &&
9025 isOperationLegal(Op: ISD::VECTOR_SHUFFLE, VT: PackVT)) {
9026 int Idx0 = SVN->getMaskElt(Idx: I);
9027 int Idx1 = SVN->getMaskElt(Idx: I + 1);
9028
9029 SDValue SrcOp0 = SVN->getOperand(Num: 0);
9030 SDValue SrcOp1 = SrcOp0;
9031 if (Idx0 >= SrcNumElts) {
9032 SrcOp0 = SVN->getOperand(Num: 1);
9033 Idx0 -= SrcNumElts;
9034 }
9035
9036 if (Idx1 >= SrcNumElts) {
9037 SrcOp1 = SVN->getOperand(Num: 1);
9038 Idx1 -= SrcNumElts;
9039 }
9040
9041 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
9042 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
9043
9044 // Extract nearest even aligned piece.
9045 SDValue SubVec0 = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: PackVT, N1: SrcOp0,
9046 N2: DAG.getConstant(Val: AlignedIdx0, DL: SL, VT: MVT::i32));
9047 SDValue SubVec1 = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: PackVT, N1: SrcOp1,
9048 N2: DAG.getConstant(Val: AlignedIdx1, DL: SL, VT: MVT::i32));
9049
9050 int NewMaskIdx0 = Idx0 - AlignedIdx0;
9051 int NewMaskIdx1 = Idx1 - AlignedIdx1;
9052
9053 SDValue Result0 = SubVec0;
9054 SDValue Result1 = SubVec0;
9055
9056 if (SubVec0 != SubVec1) {
9057 NewMaskIdx1 += NewSrcNumElts;
9058 Result1 = SubVec1;
9059 } else {
9060 Result1 = DAG.getPOISON(VT: PackVT);
9061 }
9062
9063 SDValue Shuf = DAG.getVectorShuffle(VT: PackVT, dl: SL, N1: Result0, N2: Result1,
9064 Mask: {NewMaskIdx0, NewMaskIdx1});
9065 Pieces.push_back(Elt: Shuf);
9066 } else {
9067 const int Idx0 = SVN->getMaskElt(Idx: I);
9068 const int Idx1 = SVN->getMaskElt(Idx: I + 1);
9069 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
9070 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
9071 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
9072 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
9073
9074 SDValue Vec0 = SVN->getOperand(Num: VecIdx0);
9075 SDValue Elt0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, N1: Vec0,
9076 N2: DAG.getSignedConstant(Val: EltIdx0, DL: SL, VT: MVT::i32));
9077
9078 SDValue Vec1 = SVN->getOperand(Num: VecIdx1);
9079 SDValue Elt1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, N1: Vec1,
9080 N2: DAG.getSignedConstant(Val: EltIdx1, DL: SL, VT: MVT::i32));
9081 Pieces.push_back(Elt: DAG.getBuildVector(VT: PackVT, DL: SL, Ops: {Elt0, Elt1}));
9082 }
9083 }
9084
9085 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SL, VT: ResultVT, Ops: Pieces);
9086}
9087
9088SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
9089 SelectionDAG &DAG) const {
9090 SDValue SVal = Op.getOperand(i: 0);
9091 EVT ResultVT = Op.getValueType();
9092 EVT SValVT = SVal.getValueType();
9093 SDValue UndefVal = DAG.getPOISON(VT: SValVT);
9094 SDLoc SL(Op);
9095
9096 SmallVector<SDValue, 8> VElts;
9097 VElts.push_back(Elt: SVal);
9098 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
9099 VElts.push_back(Elt: UndefVal);
9100
9101 return DAG.getBuildVector(VT: ResultVT, DL: SL, Ops: VElts);
9102}
9103
9104SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
9105 SelectionDAG &DAG) const {
9106 SDLoc SL(Op);
9107 EVT VT = Op.getValueType();
9108
9109 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
9110 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
9111
9112 SDValue Lo = Op.getOperand(i: 0);
9113 SDValue Hi = Op.getOperand(i: 1);
9114
9115 // Avoid adding defined bits with the zero_extend.
9116 if (Hi.isUndef()) {
9117 Lo = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i16, Operand: Lo);
9118 SDValue ExtLo = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i32, Operand: Lo);
9119 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: ExtLo);
9120 }
9121
9122 Hi = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i16, Operand: Hi);
9123 Hi = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: MVT::i32, Operand: Hi);
9124
9125 SDValue ShlHi = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: Hi,
9126 N2: DAG.getConstant(Val: 16, DL: SL, VT: MVT::i32));
9127 if (Lo.isUndef())
9128 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: ShlHi);
9129
9130 Lo = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i16, Operand: Lo);
9131 Lo = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: MVT::i32, Operand: Lo);
9132
9133 SDValue Or =
9134 DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: Lo, N2: ShlHi, Flags: SDNodeFlags::Disjoint);
9135 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Or);
9136 }
9137
9138 // Split into 2-element chunks.
9139 const unsigned NumParts = VT.getVectorNumElements() / 2;
9140 EVT PartVT = MVT::getVectorVT(VT: VT.getVectorElementType().getSimpleVT(), NumElements: 2);
9141 MVT PartIntVT = MVT::getIntegerVT(BitWidth: PartVT.getSizeInBits());
9142
9143 SmallVector<SDValue> Casts;
9144 for (unsigned P = 0; P < NumParts; ++P) {
9145 SDValue Vec = DAG.getBuildVector(
9146 VT: PartVT, DL: SL, Ops: {Op.getOperand(i: P * 2), Op.getOperand(i: P * 2 + 1)});
9147 Casts.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: PartIntVT, Operand: Vec));
9148 }
9149
9150 SDValue Blend =
9151 DAG.getBuildVector(VT: MVT::getVectorVT(VT: PartIntVT, NumElements: NumParts), DL: SL, Ops: Casts);
9152 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Blend);
9153}
9154
9155bool SITargetLowering::isOffsetFoldingLegal(
9156 const GlobalAddressSDNode *GA) const {
9157 // OSes that use ELF REL relocations (instead of RELA) can only store a
9158 // 32-bit addend in the instruction, so it is not safe to allow offset folding
9159 // which can create arbitrary 64-bit addends. (This is only a problem for
9160 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
9161 // the high 32 bits of the addend.)
9162 //
9163 // This should be kept in sync with how HasRelocationAddend is initialized in
9164 // the constructor of ELFAMDGPUAsmBackend.
9165 if (!Subtarget->isAmdHsaOS())
9166 return false;
9167
9168 // We can fold offsets for anything that doesn't require a GOT relocation.
9169 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
9170 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
9171 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
9172 !shouldEmitGOTReloc(GV: GA->getGlobal());
9173}
9174
9175static SDValue
9176buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
9177 const SDLoc &DL, int64_t Offset, EVT PtrVT,
9178 unsigned GAFlags = SIInstrInfo::MO_NONE) {
9179 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
9180 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
9181 // lowered to the following code sequence:
9182 //
9183 // For constant address space:
9184 // s_getpc_b64 s[0:1]
9185 // s_add_u32 s0, s0, $symbol
9186 // s_addc_u32 s1, s1, 0
9187 //
9188 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
9189 // a fixup or relocation is emitted to replace $symbol with a literal
9190 // constant, which is a pc-relative offset from the encoding of the $symbol
9191 // operand to the global variable.
9192 //
9193 // For global address space:
9194 // s_getpc_b64 s[0:1]
9195 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
9196 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
9197 //
9198 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
9199 // fixups or relocations are emitted to replace $symbol@*@lo and
9200 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
9201 // which is a 64-bit pc-relative offset from the encoding of the $symbol
9202 // operand to the global variable.
9203 if (((const GCNSubtarget &)DAG.getSubtarget()).has64BitLiterals()) {
9204 assert(GAFlags != SIInstrInfo::MO_NONE);
9205
9206 SDValue Ptr =
9207 DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i64, offset: Offset, TargetFlags: GAFlags + 2);
9208 return DAG.getNode(Opcode: AMDGPUISD::PC_ADD_REL_OFFSET64, DL, VT: PtrVT, Operand: Ptr);
9209 }
9210
9211 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i32, offset: Offset, TargetFlags: GAFlags);
9212 SDValue PtrHi;
9213 if (GAFlags == SIInstrInfo::MO_NONE)
9214 PtrHi = DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32);
9215 else
9216 PtrHi = DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i32, offset: Offset, TargetFlags: GAFlags + 1);
9217 return DAG.getNode(Opcode: AMDGPUISD::PC_ADD_REL_OFFSET, DL, VT: PtrVT, N1: PtrLo, N2: PtrHi);
9218}
9219
9220SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
9221 SDValue Op,
9222 SelectionDAG &DAG) const {
9223 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Val&: Op);
9224 SDLoc DL(GSD);
9225 EVT PtrVT = Op.getValueType();
9226
9227 const GlobalValue *GV = GSD->getGlobal();
9228 if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
9229 shouldUseLDSConstAddress(GV)) ||
9230 GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
9231 GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
9232 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
9233 GV->hasExternalLinkage()) {
9234 const GlobalVariable &GVar = *cast<GlobalVariable>(Val: GV);
9235      // HIP uses an unsized array `extern __shared__ T s[]` or a similar
9236      // zero-sized type in other languages to declare dynamic shared memory
9237      // whose size is not known at compile time. These arrays are allocated
9238      // by the runtime and placed directly after the statically allocated
9239      // ones. They all share the same offset.
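      // For example (HIP):
      //   extern __shared__ float s[];
      // becomes a zero-sized external LDS global, and the "address" returned
      // here is the total static LDS size (GET_GROUPSTATICSIZE), i.e. the
      // offset at which the runtime-allocated dynamic region begins.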
9240 if (GVar.getGlobalSize(DL: GVar.getDataLayout()) == 0) {
9241 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
9242 // Adjust alignment for that dynamic shared memory array.
9243 Function &F = DAG.getMachineFunction().getFunction();
9244 MFI->setDynLDSAlign(F, GV: GVar);
9245 MFI->setUsesDynamicLDS(true);
9246 return SDValue(
9247 DAG.getMachineNode(Opcode: AMDGPU::GET_GROUPSTATICSIZE, dl: DL, VT: PtrVT), 0);
9248 }
9249 }
9250 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
9251 }
9252
9253 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
9254 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i32, offset: GSD->getOffset(),
9255 TargetFlags: SIInstrInfo::MO_ABS32_LO);
9256 return DAG.getNode(Opcode: AMDGPUISD::LDS, DL, VT: MVT::i32, Operand: GA);
9257 }
9258
9259 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
9260 if (Subtarget->has64BitLiterals()) {
9261 SDValue Addr = DAG.getTargetGlobalAddress(
9262 GV, DL, VT: MVT::i64, offset: GSD->getOffset(), TargetFlags: SIInstrInfo::MO_ABS64);
9263 return SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B64, dl: DL, VT: MVT::i64, Op1: Addr),
9264 0);
9265 }
9266
9267 SDValue AddrLo = DAG.getTargetGlobalAddress(
9268 GV, DL, VT: MVT::i32, offset: GSD->getOffset(), TargetFlags: SIInstrInfo::MO_ABS32_LO);
9269 AddrLo = {DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32, Op1: AddrLo), 0};
9270
9271 SDValue AddrHi = DAG.getTargetGlobalAddress(
9272 GV, DL, VT: MVT::i32, offset: GSD->getOffset(), TargetFlags: SIInstrInfo::MO_ABS32_HI);
9273 AddrHi = {DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32, Op1: AddrHi), 0};
9274
9275 return DAG.getNode(Opcode: ISD::BUILD_PAIR, DL, VT: MVT::i64, N1: AddrLo, N2: AddrHi);
9276 }
9277
9278 if (shouldEmitFixup(GV))
9279 return buildPCRelGlobalAddress(DAG, GV, DL, Offset: GSD->getOffset(), PtrVT);
9280
9281 if (shouldEmitPCReloc(GV))
9282 return buildPCRelGlobalAddress(DAG, GV, DL, Offset: GSD->getOffset(), PtrVT,
9283 GAFlags: SIInstrInfo::MO_REL32);
9284
9285 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, Offset: 0, PtrVT,
9286 GAFlags: SIInstrInfo::MO_GOTPCREL32);
9287 PointerType *PtrTy =
9288 PointerType::get(C&: *DAG.getContext(), AddressSpace: AMDGPUAS::CONSTANT_ADDRESS);
9289 const DataLayout &DataLayout = DAG.getDataLayout();
9290 Align Alignment = DataLayout.getABITypeAlign(Ty: PtrTy);
9291 MachinePointerInfo PtrInfo =
9292 MachinePointerInfo::getGOT(MF&: DAG.getMachineFunction());
9293
9294 return DAG.getLoad(VT: PtrVT, dl: DL, Chain: DAG.getEntryNode(), Ptr: GOTAddr, PtrInfo, Alignment,
9295 MMOFlags: MachineMemOperand::MODereferenceable |
9296 MachineMemOperand::MOInvariant);
9297}
9298
9299SDValue SITargetLowering::LowerExternalSymbol(SDValue Op,
9300 SelectionDAG &DAG) const {
9301 // TODO: Handle this. It should be mostly the same as LowerGlobalAddress.
9302 const Function &Fn = DAG.getMachineFunction().getFunction();
9303 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
9304 Fn, "unsupported external symbol", Op.getDebugLoc()));
9305 return DAG.getPOISON(VT: Op.getValueType());
9306}
9307
9308SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
9309 const SDLoc &DL, SDValue V) const {
9310 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
9311 // the destination register.
9312 //
9313 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
9314 // so we will end up with redundant moves to m0.
9315 //
9316 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
9317
9318 // A Null SDValue creates a glue result.
9319 SDNode *M0 = DAG.getMachineNode(Opcode: AMDGPU::SI_INIT_M0, dl: DL, VT1: MVT::Other, VT2: MVT::Glue,
9320 Op1: V, Op2: Chain);
9321 return SDValue(M0, 0);
9322}
9323
9324SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
9325 MVT VT,
9326 unsigned Offset) const {
9327 SDLoc SL(Op);
9328 SDValue Param = lowerKernargMemParameter(
9329 DAG, VT: MVT::i32, MemVT: MVT::i32, SL, Chain: DAG.getEntryNode(), Offset, Alignment: Align(4), Signed: false);
9330 // The local size values will have the hi 16-bits as zero.
9331 return DAG.getNode(Opcode: ISD::AssertZext, DL: SL, VT: MVT::i32, N1: Param,
9332 N2: DAG.getValueType(VT));
9333}
9334
9335static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
9336 EVT VT) {
9337 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
9338 DAG.getMachineFunction().getFunction(),
9339 "non-hsa intrinsic with hsa target", DL.getDebugLoc()));
9340 return DAG.getPOISON(VT);
9341}
9342
9343static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
9344 EVT VT) {
9345 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
9346 DAG.getMachineFunction().getFunction(),
9347 "intrinsic not supported on subtarget", DL.getDebugLoc()));
9348 return DAG.getPOISON(VT);
9349}
9350
9351static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
9352 ArrayRef<SDValue> Elts) {
9353 assert(!Elts.empty());
9354 MVT Type;
9355 unsigned NumElts = Elts.size();
9356
9357 if (NumElts <= 12) {
9358 Type = MVT::getVectorVT(VT: MVT::f32, NumElements: NumElts);
9359 } else {
9360 assert(Elts.size() <= 16);
9361 Type = MVT::v16f32;
9362 NumElts = 16;
9363 }
9364
9365 SmallVector<SDValue, 16> VecElts(NumElts);
9366 for (unsigned i = 0; i < Elts.size(); ++i) {
9367 SDValue Elt = Elts[i];
9368 if (Elt.getValueType() != MVT::f32)
9369 Elt = DAG.getBitcast(VT: MVT::f32, V: Elt);
9370 VecElts[i] = Elt;
9371 }
9372 for (unsigned i = Elts.size(); i < NumElts; ++i)
9373 VecElts[i] = DAG.getPOISON(VT: MVT::f32);
9374
9375 if (NumElts == 1)
9376 return VecElts[0];
9377 return DAG.getBuildVector(VT: Type, DL, Ops: VecElts);
9378}
9379
9380static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
9381 SDValue Src, int ExtraElts) {
9382 EVT SrcVT = Src.getValueType();
9383
9384 SmallVector<SDValue, 8> Elts;
9385
9386 if (SrcVT.isVector())
9387 DAG.ExtractVectorElements(Op: Src, Args&: Elts);
9388 else
9389 Elts.push_back(Elt: Src);
9390
9391 SDValue Undef = DAG.getPOISON(VT: SrcVT.getScalarType());
9392 while (ExtraElts--)
9393 Elts.push_back(Elt: Undef);
9394
9395 return DAG.getBuildVector(VT: CastVT, DL, Ops: Elts);
9396}
9397
9398// Re-construct the required return value for an image load intrinsic.
9399// This is more complicated due to the optional use of TexFailCtrl, which means
9400// the required return type is an aggregate.
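// For example, a D16 load with dmask 0xf and TFE enabled has the aggregate
// return type { v4f16, i32 } plus the chain: the packed data occupies the
// first two dwords of the machine result and the texfail word is extracted
// from the dword that follows them.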
9401static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
9402 ArrayRef<EVT> ResultTypes, bool IsTexFail,
9403 bool Unpacked, bool IsD16, int DMaskPop,
9404 int NumVDataDwords, bool IsAtomicPacked16Bit,
9405 const SDLoc &DL) {
9406  // Determine the required return type. This is the same regardless of the
9407  // IsTexFail flag.
9408 EVT ReqRetVT = ResultTypes[0];
9409 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
9410 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
9411 ? (ReqRetNumElts + 1) / 2
9412 : ReqRetNumElts;
9413
9414 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
9415
9416 MVT DataDwordVT =
9417 NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(VT: MVT::i32, NumElements: NumDataDwords);
9418
9419 MVT MaskPopVT =
9420 MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(VT: MVT::i32, NumElements: MaskPopDwords);
9421
9422 SDValue Data(Result, 0);
9423 SDValue TexFail;
9424
9425 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
9426 SDValue ZeroIdx = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
9427 if (MaskPopVT.isVector()) {
9428 Data = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MaskPopVT,
9429 N1: SDValue(Result, 0), N2: ZeroIdx);
9430 } else {
9431 Data = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MaskPopVT,
9432 N1: SDValue(Result, 0), N2: ZeroIdx);
9433 }
9434 }
9435
9436 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
9437 Data = padEltsToUndef(DAG, DL, CastVT: DataDwordVT, Src: Data,
9438 ExtraElts: NumDataDwords - MaskPopDwords);
9439
9440 if (IsD16)
9441 Data = adjustLoadValueTypeImpl(Result: Data, LoadVT: ReqRetVT, DL, DAG, Unpacked);
9442
9443 EVT LegalReqRetVT = ReqRetVT;
9444 if (!ReqRetVT.isVector()) {
9445 if (!Data.getValueType().isInteger())
9446 Data = DAG.getNode(Opcode: ISD::BITCAST, DL,
9447 VT: Data.getValueType().changeTypeToInteger(), Operand: Data);
9448 Data = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: ReqRetVT.changeTypeToInteger(), Operand: Data);
9449 } else {
9450 // We need to widen the return vector to a legal type
9451 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
9452 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
9453 LegalReqRetVT =
9454 EVT::getVectorVT(Context&: *DAG.getContext(), VT: ReqRetVT.getVectorElementType(),
9455 NumElements: ReqRetVT.getVectorNumElements() + 1);
9456 }
9457 }
9458 Data = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: LegalReqRetVT, Operand: Data);
9459
9460 if (IsTexFail) {
9461 TexFail =
9462 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: SDValue(Result, 0),
9463 N2: DAG.getConstant(Val: MaskPopDwords, DL, VT: MVT::i32));
9464
9465 return DAG.getMergeValues(Ops: {Data, TexFail, SDValue(Result, 1)}, dl: DL);
9466 }
9467
9468 if (Result->getNumValues() == 1)
9469 return Data;
9470
9471 return DAG.getMergeValues(Ops: {Data, SDValue(Result, 1)}, dl: DL);
9472}
9473
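// Decode the constant TexFailCtrl operand into separate TFE and LWE flags,
// setting IsTexFail if either bit is present. Returns false if any unknown
// bits remain, in which case the caller gives up on custom lowering.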
9474static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
9475 SDValue *LWE, bool &IsTexFail) {
9476 auto *TexFailCtrlConst = cast<ConstantSDNode>(Val: TexFailCtrl.getNode());
9477
9478 uint64_t Value = TexFailCtrlConst->getZExtValue();
9479 if (Value) {
9480 IsTexFail = true;
9481 }
9482
9483 SDLoc DL(TexFailCtrlConst);
9484 *TFE = DAG.getTargetConstant(Val: (Value & 0x1) ? 1 : 0, DL, VT: MVT::i32);
9485 Value &= ~(uint64_t)0x1;
9486 *LWE = DAG.getTargetConstant(Val: (Value & 0x2) ? 1 : 0, DL, VT: MVT::i32);
9487 Value &= ~(uint64_t)0x2;
9488
9489 return Value == 0;
9490}
9491
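// Pack 16-bit address and gradient operands two per dword for A16/G16 image
// instructions. Operands that must occupy a dword on their own (the final
// operand, or the trailing dz gradient components when the per-direction
// gradient count is odd) are any-extended instead, and every packed dword is
// bitcast to f32 before being appended.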
9492static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
9493 MVT PackVectorVT,
9494 SmallVectorImpl<SDValue> &PackedAddrs,
9495 unsigned DimIdx, unsigned EndIdx,
9496 unsigned NumGradients) {
9497 SDLoc DL(Op);
9498 for (unsigned I = DimIdx; I < EndIdx; I++) {
9499 SDValue Addr = Op.getOperand(i: I);
9500
9501 // Gradients are packed with undef for each coordinate.
9502 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
9503 // 1D: undef,dx/dh; undef,dx/dv
9504 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
9505 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
9506 if (((I + 1) >= EndIdx) ||
9507 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
9508 I == DimIdx + NumGradients - 1))) {
9509 if (Addr.getValueType() != MVT::i16)
9510 Addr = DAG.getBitcast(VT: MVT::i16, V: Addr);
9511 Addr = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::i32, Operand: Addr);
9512 } else {
9513 Addr = DAG.getBuildVector(VT: PackVectorVT, DL, Ops: {Addr, Op.getOperand(i: I + 1)});
9514 I++;
9515 }
9516 Addr = DAG.getBitcast(VT: MVT::f32, V: Addr);
9517 PackedAddrs.push_back(Elt: Addr);
9518 }
9519}
9520
9521SDValue SITargetLowering::lowerImage(SDValue Op,
9522 const AMDGPU::ImageDimIntrinsicInfo *Intr,
9523 SelectionDAG &DAG, bool WithChain) const {
9524 SDLoc DL(Op);
9525 MachineFunction &MF = DAG.getMachineFunction();
9526 const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
9527 unsigned IntrOpcode = Intr->BaseOpcode;
9528  // For image atomics, use the no-return opcode if the result is unused.
9529 if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode &&
9530 !Op.getNode()->hasAnyUseOfValue(Value: 0))
9531 IntrOpcode = Intr->AtomicNoRetBaseOpcode;
9532 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
9533 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: IntrOpcode);
9534 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(DimEnum: Intr->Dim);
9535 bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI: *Subtarget);
9536 bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI: *Subtarget);
9537 bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI: *Subtarget);
9538
9539 SmallVector<EVT, 3> ResultTypes(Op->values());
9540 SmallVector<EVT, 3> OrigResultTypes(Op->values());
9541 if (BaseOpcode->NoReturn && BaseOpcode->Atomic)
9542 ResultTypes.erase(CI: &ResultTypes[0]);
9543
9544 bool IsD16 = false;
9545 bool IsG16 = false;
9546 bool IsA16 = false;
9547 SDValue VData;
9548 int NumVDataDwords = 0;
9549 bool AdjustRetType = false;
9550 bool IsAtomicPacked16Bit = false;
9551
9552 // Offset of intrinsic arguments
9553 const unsigned ArgOffset = WithChain ? 2 : 1;
9554
9555 unsigned DMask;
9556 unsigned DMaskLanes = 0;
9557
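  // For image atomics the dmask and vdata dword count are implied by the data
  // operand: AtomicX2 (e.g. cmpswap) variants carry a second data operand,
  // doubling the size.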
9558 if (BaseOpcode->Atomic) {
9559 VData = Op.getOperand(i: 2);
9560
9561 IsAtomicPacked16Bit =
9562 (IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
9563 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN ||
9564 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 ||
9565 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN);
9566
9567 bool Is64Bit = VData.getValueSizeInBits() == 64;
9568 if (BaseOpcode->AtomicX2) {
9569 SDValue VData2 = Op.getOperand(i: 3);
9570 VData = DAG.getBuildVector(VT: Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
9571 Ops: {VData, VData2});
9572 if (Is64Bit)
9573 VData = DAG.getBitcast(VT: MVT::v4i32, V: VData);
9574
9575 if (!BaseOpcode->NoReturn)
9576 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
9577
9578 DMask = Is64Bit ? 0xf : 0x3;
9579 NumVDataDwords = Is64Bit ? 4 : 2;
9580 } else {
9581 DMask = Is64Bit ? 0x3 : 0x1;
9582 NumVDataDwords = Is64Bit ? 2 : 1;
9583 }
9584 } else {
9585 DMask = Op->getConstantOperandVal(Num: ArgOffset + Intr->DMaskIndex);
9586 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(Value: DMask);
9587
9588 if (BaseOpcode->Store) {
9589 VData = Op.getOperand(i: 2);
9590
9591 MVT StoreVT = VData.getSimpleValueType();
9592 if (StoreVT.getScalarType() == MVT::f16) {
9593 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9594 return Op; // D16 is unsupported for this instruction
9595
9596 IsD16 = true;
9597 VData = handleD16VData(VData, DAG, ImageStore: true);
9598 }
9599
9600 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
9601 } else if (!BaseOpcode->NoReturn) {
9602      // Work out the number of dwords based on the dmask popcount, the
9603      // underlying type, and whether packing is supported.
9604 MVT LoadVT = ResultTypes[0].getSimpleVT();
9605 if (LoadVT.getScalarType() == MVT::f16) {
9606 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9607 return Op; // D16 is unsupported for this instruction
9608
9609 IsD16 = true;
9610 }
9611
9612 // Confirm that the return type is large enough for the dmask specified
9613 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
9614 (!LoadVT.isVector() && DMaskLanes > 1))
9615 return Op;
9616
9617      // The sq block of gfx8 and gfx9 does not estimate register use correctly
9618      // for d16 image_gather4, image_gather4_l, and image_gather4_lz
9619      // instructions.
9620 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
9621 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
9622 NumVDataDwords = (DMaskLanes + 1) / 2;
9623 else
9624 NumVDataDwords = DMaskLanes;
9625
9626 AdjustRetType = true;
9627 }
9628 }
9629
9630 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
9631 SmallVector<SDValue, 4> VAddrs;
9632
9633  // Check for 16-bit addresses or derivatives; if found, they are packed below.
9634 MVT VAddrVT =
9635 Op.getOperand(i: ArgOffset + Intr->GradientStart).getSimpleValueType();
9636 MVT VAddrScalarVT = VAddrVT.getScalarType();
9637 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9638 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9639
9640 VAddrVT = Op.getOperand(i: ArgOffset + Intr->CoordStart).getSimpleValueType();
9641 VAddrScalarVT = VAddrVT.getScalarType();
9642 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9643 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9644
9645 // Push back extra arguments.
9646 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
9647 if (IsA16 && (Op.getOperand(i: ArgOffset + I).getValueType() == MVT::f16)) {
9648 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
9649      // Special handling of bias when A16 is on. Bias is of type half but
9650      // occupies a full 32-bit dword.
9651 SDValue Bias = DAG.getBuildVector(
9652 VT: MVT::v2f16, DL,
9653 Ops: {Op.getOperand(i: ArgOffset + I), DAG.getPOISON(VT: MVT::f16)});
9654 VAddrs.push_back(Elt: Bias);
9655 } else {
9656 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
9657 "Bias needs to be converted to 16 bit in A16 mode");
9658 VAddrs.push_back(Elt: Op.getOperand(i: ArgOffset + I));
9659 }
9660 }
9661
9662 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
9663    // 16-bit gradients are supported, but they are tied to the A16 control,
9664    // so both gradients and addresses must be 16 bit.
9665    LLVM_DEBUG(
9666        dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
9667                  "require 16 bit args for both gradients and addresses\n");
9668 return Op;
9669 }
9670
9671 if (IsA16) {
9672 if (!ST->hasA16()) {
9673 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
9674 "support 16 bit addresses\n");
9675 return Op;
9676 }
9677 }
9678
9679  // We've dealt with incorrect input, so we know that if IsA16 or IsG16 is
9680  // set then we have to compress/pack operands (either addresses, gradients,
9681  // or both).
9682  // In the case where A16 and gradients are tied (no G16 support), we have
9683  // already verified that both IsA16 and IsG16 are true.
9684 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
9685 // Activate g16
9686 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
9687 AMDGPU::getMIMGG16MappingInfo(G: Intr->BaseOpcode);
9688 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
9689 }
9690
9691 // Add gradients (packed or unpacked)
9692 if (IsG16) {
9693 // Pack the gradients
9694 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
9695 packImage16bitOpsToDwords(DAG, Op, PackVectorVT: GradPackVectorVT, PackedAddrs&: VAddrs,
9696 DimIdx: ArgOffset + Intr->GradientStart,
9697 EndIdx: ArgOffset + Intr->CoordStart, NumGradients: Intr->NumGradients);
9698 } else {
9699 for (unsigned I = ArgOffset + Intr->GradientStart;
9700 I < ArgOffset + Intr->CoordStart; I++)
9701 VAddrs.push_back(Elt: Op.getOperand(i: I));
9702 }
9703
9704 // Add addresses (packed or unpacked)
9705 if (IsA16) {
9706 packImage16bitOpsToDwords(DAG, Op, PackVectorVT: AddrPackVectorVT, PackedAddrs&: VAddrs,
9707 DimIdx: ArgOffset + Intr->CoordStart, EndIdx: VAddrEnd,
9708 NumGradients: 0 /* No gradients */);
9709 } else {
9710 // Add uncompressed address
9711 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
9712 VAddrs.push_back(Elt: Op.getOperand(i: I));
9713 }
9714
9715 // If the register allocator cannot place the address registers contiguously
9716 // without introducing moves, then using the non-sequential address encoding
9717 // is always preferable, since it saves VALU instructions and is usually a
9718 // wash in terms of code size or even better.
9719 //
9720 // However, we currently have no way of hinting to the register allocator that
9721 // MIMG addresses should be placed contiguously when it is possible to do so,
9722 // so force non-NSA for the common 2-address case as a heuristic.
9723 //
9724 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
9725 // allocation when possible.
9726 //
9727 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
9728 // set of the remaining addresses.
9729 const unsigned NSAMaxSize = ST->getNSAMaxSize(HasSampler: BaseOpcode->Sampler);
9730 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
9731 const bool UseNSA = ST->hasNSAEncoding() &&
9732 VAddrs.size() >= ST->getNSAThreshold(MF) &&
9733 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
9734 const bool UsePartialNSA =
9735 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
9736
9737 SDValue VAddr;
9738 if (UsePartialNSA) {
9739 VAddr = getBuildDwordsVector(DAG, DL,
9740 Elts: ArrayRef(VAddrs).drop_front(N: NSAMaxSize - 1));
9741 } else if (!UseNSA) {
9742 VAddr = getBuildDwordsVector(DAG, DL, Elts: VAddrs);
9743 }
9744
9745 SDValue True = DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1);
9746 SDValue False = DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1);
9747 SDValue Unorm;
9748 if (!BaseOpcode->Sampler) {
9749 Unorm = True;
9750 } else {
9751 uint64_t UnormConst =
9752 Op.getConstantOperandVal(i: ArgOffset + Intr->UnormIndex);
9753
9754 Unorm = UnormConst ? True : False;
9755 }
9756
9757 SDValue TFE;
9758 SDValue LWE;
9759 SDValue TexFail = Op.getOperand(i: ArgOffset + Intr->TexFailCtrlIndex);
9760 bool IsTexFail = false;
9761 if (!parseTexFail(TexFailCtrl: TexFail, DAG, TFE: &TFE, LWE: &LWE, IsTexFail))
9762 return Op;
9763
9764 if (IsTexFail) {
9765 if (!DMaskLanes) {
9766      // Expecting to get an error flag since TFC is on and dmask is 0.
9767      // Force dmask to be at least 1, otherwise the instruction will fail.
9768 DMask = 0x1;
9769 DMaskLanes = 1;
9770 NumVDataDwords = 1;
9771 }
9772 NumVDataDwords += 1;
9773 AdjustRetType = true;
9774 }
9775
9776  // Something earlier tagged that the return type needs adjusting. This
9777  // happens if the instruction is a load or has TexFailCtrl flags set.
9778 if (AdjustRetType) {
9779    // NumVDataDwords reflects the true number of dwords required in the
9780    // return type.
9781 if (DMaskLanes == 0 && !BaseOpcode->Store) {
9782      // This is a no-op load; it can be eliminated.
9783 SDValue Undef = DAG.getPOISON(VT: Op.getValueType());
9784 if (isa<MemSDNode>(Val: Op))
9785 return DAG.getMergeValues(Ops: {Undef, Op.getOperand(i: 0)}, dl: DL);
9786 return Undef;
9787 }
9788
9789 EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(Context&: *DAG.getContext(),
9790 VT: MVT::i32, NumElements: NumVDataDwords)
9791 : MVT::i32;
9792
9793 ResultTypes[0] = NewVT;
9794 if (ResultTypes.size() == 3) {
9795      // The original result was an aggregate type used for the TexFailCtrl
9796      // results. The actual instruction returns as a vector type, which has
9797      // now been created. Remove the aggregate result.
9798 ResultTypes.erase(CI: &ResultTypes[1]);
9799 }
9800 }
9801
9802 unsigned CPol = Op.getConstantOperandVal(i: ArgOffset + Intr->CachePolicyIndex);
9803 // Keep GLC only when the atomic's result is actually used.
9804 if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
9805 CPol |= AMDGPU::CPol::GLC;
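  // Reject cache policy bits that are not representable on this generation.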
9806 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
9807 AMDGPU::CPol::VOLATILE))
9808 return Op;
9809
9810 SmallVector<SDValue, 26> Ops;
9811 if (BaseOpcode->Store || BaseOpcode->Atomic)
9812 Ops.push_back(Elt: VData); // vdata
9813 if (UsePartialNSA) {
9814 append_range(C&: Ops, R: ArrayRef(VAddrs).take_front(N: NSAMaxSize - 1));
9815 Ops.push_back(Elt: VAddr);
9816 } else if (UseNSA)
9817 append_range(C&: Ops, R&: VAddrs);
9818 else
9819 Ops.push_back(Elt: VAddr);
9820 SDValue Rsrc = Op.getOperand(i: ArgOffset + Intr->RsrcIndex);
9821 EVT RsrcVT = Rsrc.getValueType();
9822 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
9823 return Op;
9824 Ops.push_back(Elt: Rsrc);
9825 if (BaseOpcode->Sampler) {
9826 SDValue Samp = Op.getOperand(i: ArgOffset + Intr->SampIndex);
9827 if (Samp.getValueType() != MVT::v4i32)
9828 return Op;
9829 Ops.push_back(Elt: Samp);
9830 }
9831 Ops.push_back(Elt: DAG.getTargetConstant(Val: DMask, DL, VT: MVT::i32));
9832 if (IsGFX10Plus)
9833 Ops.push_back(Elt: DAG.getTargetConstant(Val: DimInfo->Encoding, DL, VT: MVT::i32));
9834 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9835 Ops.push_back(Elt: Unorm);
9836 Ops.push_back(Elt: DAG.getTargetConstant(Val: CPol, DL, VT: MVT::i32));
9837 Ops.push_back(Elt: IsA16 && // r128, a16 for gfx9
9838 ST->hasFeature(Feature: AMDGPU::FeatureR128A16)
9839 ? True
9840 : False);
9841 if (IsGFX10Plus)
9842 Ops.push_back(Elt: IsA16 ? True : False);
9843
9844 if (!Subtarget->hasGFX90AInsts())
9845 Ops.push_back(Elt: TFE); // tfe
9846 else if (TFE->getAsZExtVal()) {
9847 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
9848 DAG.getMachineFunction().getFunction(),
9849 "TFE is not supported on this GPU", DL.getDebugLoc()));
9850 }
9851
9852 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9853 Ops.push_back(Elt: LWE); // lwe
9854 if (!IsGFX10Plus)
9855 Ops.push_back(Elt: DimInfo->DA ? True : False);
9856 if (BaseOpcode->HasD16)
9857 Ops.push_back(Elt: IsD16 ? True : False);
9858 if (isa<MemSDNode>(Val: Op))
9859 Ops.push_back(Elt: Op.getOperand(i: 0)); // chain
9860
9861 int NumVAddrDwords =
9862 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
9863 int Opcode = -1;
9864
9865 if (IsGFX12Plus) {
9866 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx12,
9867 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
9868 } else if (IsGFX11Plus) {
9869 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode,
9870 MIMGEncoding: UseNSA ? AMDGPU::MIMGEncGfx11NSA
9871 : AMDGPU::MIMGEncGfx11Default,
9872 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
9873 } else if (IsGFX10Plus) {
9874 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode,
9875 MIMGEncoding: UseNSA ? AMDGPU::MIMGEncGfx10NSA
9876 : AMDGPU::MIMGEncGfx10Default,
9877 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
9878 } else {
9879 if (Subtarget->hasGFX90AInsts()) {
9880 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx90a,
9881 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
9882 if (Opcode == -1) {
9883 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
9884 DAG.getMachineFunction().getFunction(),
9885 "requested image instruction is not supported on this GPU",
9886 DL.getDebugLoc()));
9887
9888 unsigned Idx = 0;
9889 SmallVector<SDValue, 3> RetValues(OrigResultTypes.size());
9890 for (EVT VT : OrigResultTypes) {
9891 if (VT == MVT::Other)
9892 RetValues[Idx++] = Op.getOperand(i: 0); // Chain
9893 else
9894 RetValues[Idx++] = DAG.getPOISON(VT);
9895 }
9896
9897 return DAG.getMergeValues(Ops: RetValues, dl: DL);
9898 }
9899 }
9900 if (Opcode == -1 &&
9901 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9902 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx8,
9903 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
9904 if (Opcode == -1)
9905 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx6,
9906 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
9907 }
9908 if (Opcode == -1)
9909 return Op;
9910
9911 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, dl: DL, ResultTys: ResultTypes, Ops);
9912 if (auto *MemOp = dyn_cast<MemSDNode>(Val&: Op)) {
9913 MachineMemOperand *MemRef = MemOp->getMemOperand();
9914 DAG.setNodeMemRefs(N: NewNode, NewMemRefs: {MemRef});
9915 }
9916
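  // No-return image ops only produce a chain. For no-return atomics the
  // original intrinsic still advertises a data result, so pair a poison value
  // with the new chain so the result numbers still line up.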
9917 if (BaseOpcode->NoReturn) {
9918 if (BaseOpcode->Atomic)
9919 return DAG.getMergeValues(
9920 Ops: {DAG.getPOISON(VT: OrigResultTypes[0]), SDValue(NewNode, 0)}, dl: DL);
9921
9922 return SDValue(NewNode, 0);
9923 }
9924
9925 if (BaseOpcode->AtomicX2) {
9926 SmallVector<SDValue, 1> Elt;
9927 DAG.ExtractVectorElements(Op: SDValue(NewNode, 0), Args&: Elt, Start: 0, Count: 1);
9928 return DAG.getMergeValues(Ops: {Elt[0], SDValue(NewNode, 1)}, dl: DL);
9929 }
9930
9931 return constructRetValue(DAG, Result: NewNode, ResultTypes: OrigResultTypes, IsTexFail,
9932 Unpacked: Subtarget->hasUnpackedD16VMem(), IsD16, DMaskPop: DMaskLanes,
9933 NumVDataDwords, IsAtomicPacked16Bit, DL);
9934}
9935
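// Lower llvm.amdgcn.s.buffer.load. Uniform offsets are selected to scalar
// SMEM loads (widening vec3 results to vec4 where needed); divergent offsets
// fall back to MUBUF buffer loads, splitting wide results into dwordx4 pieces.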
9936SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
9937 SDValue Offset, SDValue CachePolicy,
9938 SelectionDAG &DAG) const {
9939 MachineFunction &MF = DAG.getMachineFunction();
9940
9941 const DataLayout &DataLayout = DAG.getDataLayout();
9942 Align Alignment =
9943 DataLayout.getABITypeAlign(Ty: VT.getTypeForEVT(Context&: *DAG.getContext()));
9944
9945 MachineMemOperand *MMO = MF.getMachineMemOperand(
9946 PtrInfo: MachinePointerInfo(),
9947 F: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
9948 MachineMemOperand::MOInvariant,
9949 Size: VT.getStoreSize(), BaseAlignment: Alignment);
9950
9951 if (!Offset->isDivergent()) {
9952 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
9953
9954 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
9955 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
9956 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
9957 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
9958 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9959 SDValue BufferLoad =
9960 DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::SBUFFER_LOAD_USHORT, dl: DL,
9961 VTList: DAG.getVTList(VT: MVT::i32), Ops, MemVT: VT, MMO);
9962 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: BufferLoad);
9963 }
9964
9965 // Widen vec3 load to vec4.
9966 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
9967 !Subtarget->hasScalarDwordx3Loads()) {
9968 EVT WidenedVT =
9969 EVT::getVectorVT(Context&: *DAG.getContext(), VT: VT.getVectorElementType(), NumElements: 4);
9970 auto WidenedOp = DAG.getMemIntrinsicNode(
9971 Opcode: AMDGPUISD::SBUFFER_LOAD, dl: DL, VTList: DAG.getVTList(VT: WidenedVT), Ops, MemVT: WidenedVT,
9972 MMO: MF.getMachineMemOperand(MMO, Offset: 0, Size: WidenedVT.getStoreSize()));
9973 auto Subvector = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: WidenedOp,
9974 N2: DAG.getVectorIdxConstant(Val: 0, DL));
9975 return Subvector;
9976 }
9977
9978 return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::SBUFFER_LOAD, dl: DL,
9979 VTList: DAG.getVTList(VT), Ops, MemVT: VT, MMO);
9980 }
9981
9982 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
9983 // assume that the buffer is unswizzled.
9984 SDValue Ops[] = {
9985 DAG.getEntryNode(), // Chain
9986 Rsrc, // rsrc
9987 DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
9988 {}, // voffset
9989 {}, // soffset
9990 {}, // offset
9991 CachePolicy, // cachepolicy
9992 DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
9993 };
9994 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9995 setBufferOffsets(CombinedOffset: Offset, DAG, Offsets: &Ops[3], Alignment: Align(4));
9996 return handleByteShortBufferLoads(DAG, LoadVT: VT, DL, Ops, MMO);
9997 }
9998
9999 SmallVector<SDValue, 4> Loads;
10000 unsigned NumLoads = 1;
10001 MVT LoadVT = VT.getSimpleVT();
10002 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
10003 assert((LoadVT.getScalarType() == MVT::i32 ||
10004 LoadVT.getScalarType() == MVT::f32));
10005
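  // MUBUF dword loads return at most 4 dwords, so split 8- and 16-element
  // results into dwordx4 pieces and concatenate them below.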
10006 if (NumElts == 8 || NumElts == 16) {
10007 NumLoads = NumElts / 4;
10008 LoadVT = MVT::getVectorVT(VT: LoadVT.getScalarType(), NumElements: 4);
10009 }
10010
10011 SDVTList VTList = DAG.getVTList(VTs: {LoadVT, MVT::Other});
10012
10013  // Use the alignment to ensure that the required offsets will fit into the
10014  // immediate offset fields.
10015 setBufferOffsets(CombinedOffset: Offset, DAG, Offsets: &Ops[3],
10016 Alignment: NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
10017
10018 uint64_t InstOffset = Ops[5]->getAsZExtVal();
10019 for (unsigned i = 0; i < NumLoads; ++i) {
10020 Ops[5] = DAG.getTargetConstant(Val: InstOffset + 16 * i, DL, VT: MVT::i32);
10021 Loads.push_back(Elt: getMemIntrinsicNode(Opcode: AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
10022 MemVT: LoadVT, MMO, DAG));
10023 }
10024
10025 if (NumElts == 8 || NumElts == 16)
10026 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, Ops: Loads);
10027
10028 return Loads[0];
10029}
10030
10031SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
10032 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
10033 if (!Subtarget->hasArchitectedSGPRs())
10034 return {};
10035 SDLoc SL(Op);
10036 MVT VT = MVT::i32;
10037 SDValue TTMP8 = DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: SL, Reg: AMDGPU::TTMP8, VT);
10038 return DAG.getNode(Opcode: AMDGPUISD::BFE_U32, DL: SL, VT, N1: TTMP8,
10039 N2: DAG.getConstant(Val: 25, DL: SL, VT), N3: DAG.getConstant(Val: 5, DL: SL, VT));
10040}
10041
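// Read a bitfield from a hardware register with s_getreg_b32. The _const
// pseudo models the read as having no side effects, which is only valid for
// fields that stay constant for the lifetime of the wave.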
10042SDValue SITargetLowering::lowerConstHwRegRead(SelectionDAG &DAG, SDValue Op,
10043 AMDGPU::Hwreg::Id HwReg,
10044 unsigned LowBit,
10045 unsigned Width) const {
10046 SDLoc SL(Op);
10047 using namespace AMDGPU::Hwreg;
10048 return {DAG.getMachineNode(
10049 Opcode: AMDGPU::S_GETREG_B32_const, dl: SL, VT: MVT::i32,
10050 Op1: DAG.getTargetConstant(Val: HwregEncoding::encode(Values: HwReg, Values: LowBit, Values: Width),
10051 DL: SL, VT: MVT::i32)),
10052 0};
10053}
10054
10055SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
10056 unsigned Dim,
10057 const ArgDescriptor &Arg) const {
10058 SDLoc SL(Op);
10059 MachineFunction &MF = DAG.getMachineFunction();
10060 unsigned MaxID = Subtarget->getMaxWorkitemID(Kernel: MF.getFunction(), Dimension: Dim);
10061 if (MaxID == 0)
10062 return DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32);
10063
10064 // It's undefined behavior if a function marked with the amdgpu-no-*
10065 // attributes uses the corresponding intrinsic.
10066 if (!Arg)
10067 return DAG.getPOISON(VT: Op->getValueType(ResNo: 0));
10068
10069 SDValue Val = loadInputValue(DAG, RC: &AMDGPU::VGPR_32RegClass, VT: MVT::i32,
10070 SL: SDLoc(DAG.getEntryNode()), Arg);
10071
10072 // Don't bother inserting AssertZext for packed IDs since we're emitting the
10073 // masking operations anyway.
10074 //
10075 // TODO: We could assert the top bit is 0 for the source copy.
10076 if (Arg.isMasked())
10077 return Val;
10078
10079 // Preserve the known bits after expansion to a copy.
10080 EVT SmallVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: llvm::bit_width(Value: MaxID));
10081 return DAG.getNode(Opcode: ISD::AssertZext, DL: SL, VT: MVT::i32, N1: Val,
10082 N2: DAG.getValueType(SmallVT));
10083}
10084
10085SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
10086 SelectionDAG &DAG) const {
10087 MachineFunction &MF = DAG.getMachineFunction();
10088 auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
10089
10090 EVT VT = Op.getValueType();
10091 SDLoc DL(Op);
10092 unsigned IntrinsicID = Op.getConstantOperandVal(i: 0);
10093
10094 // TODO: Should this propagate fast-math-flags?
10095
10096 switch (IntrinsicID) {
10097 case Intrinsic::amdgcn_implicit_buffer_ptr: {
10098 if (getSubtarget()->isAmdHsaOrMesa(F: MF.getFunction()))
10099 return emitNonHSAIntrinsicError(DAG, DL, VT);
10100 return getPreloadedValue(DAG, MFI: *MFI, VT,
10101 PVID: AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
10102 }
10103 case Intrinsic::amdgcn_dispatch_ptr:
10104 case Intrinsic::amdgcn_queue_ptr: {
10105 if (!Subtarget->isAmdHsaOrMesa(F: MF.getFunction())) {
10106 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
10107 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
10108 DL.getDebugLoc()));
10109 return DAG.getPOISON(VT);
10110 }
10111
10112 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
10113 ? AMDGPUFunctionArgInfo::DISPATCH_PTR
10114 : AMDGPUFunctionArgInfo::QUEUE_PTR;
10115 return getPreloadedValue(DAG, MFI: *MFI, VT, PVID: RegID);
10116 }
10117 case Intrinsic::amdgcn_implicitarg_ptr: {
10118 if (MFI->isEntryFunction())
10119 return getImplicitArgPtr(DAG, SL: DL);
10120 return getPreloadedValue(DAG, MFI: *MFI, VT,
10121 PVID: AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
10122 }
10123 case Intrinsic::amdgcn_kernarg_segment_ptr: {
10124 if (!AMDGPU::isKernel(F: MF.getFunction())) {
10125 // This only makes sense to call in a kernel, so just lower to null.
10126 return DAG.getConstant(Val: 0, DL, VT);
10127 }
10128
10129 return getPreloadedValue(DAG, MFI: *MFI, VT,
10130 PVID: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
10131 }
10132 case Intrinsic::amdgcn_dispatch_id: {
10133 return getPreloadedValue(DAG, MFI: *MFI, VT, PVID: AMDGPUFunctionArgInfo::DISPATCH_ID);
10134 }
10135 case Intrinsic::amdgcn_rcp:
10136 return DAG.getNode(Opcode: AMDGPUISD::RCP, DL, VT, Operand: Op.getOperand(i: 1));
10137 case Intrinsic::amdgcn_rsq:
10138 return DAG.getNode(Opcode: AMDGPUISD::RSQ, DL, VT, Operand: Op.getOperand(i: 1));
10139 case Intrinsic::amdgcn_rsq_legacy:
10140 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
10141 return emitRemovedIntrinsicError(DAG, DL, VT);
10142 return SDValue();
10143 case Intrinsic::amdgcn_rcp_legacy:
10144 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
10145 return emitRemovedIntrinsicError(DAG, DL, VT);
10146 return DAG.getNode(Opcode: AMDGPUISD::RCP_LEGACY, DL, VT, Operand: Op.getOperand(i: 1));
10147 case Intrinsic::amdgcn_rsq_clamp: {
10148 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
10149 return DAG.getNode(Opcode: AMDGPUISD::RSQ_CLAMP, DL, VT, Operand: Op.getOperand(i: 1));
10150
10151 Type *Type = VT.getTypeForEVT(Context&: *DAG.getContext());
10152 APFloat Max = APFloat::getLargest(Sem: Type->getFltSemantics());
10153 APFloat Min = APFloat::getLargest(Sem: Type->getFltSemantics(), Negative: true);
10154
10155 SDValue Rsq = DAG.getNode(Opcode: AMDGPUISD::RSQ, DL, VT, Operand: Op.getOperand(i: 1));
10156 SDValue Tmp =
10157 DAG.getNode(Opcode: ISD::FMINNUM, DL, VT, N1: Rsq, N2: DAG.getConstantFP(Val: Max, DL, VT));
10158 return DAG.getNode(Opcode: ISD::FMAXNUM, DL, VT, N1: Tmp,
10159 N2: DAG.getConstantFP(Val: Min, DL, VT));
10160 }
10161 case Intrinsic::r600_read_ngroups_x:
10162 if (Subtarget->isAmdHsaOS())
10163 return emitNonHSAIntrinsicError(DAG, DL, VT);
10164
10165 return lowerKernargMemParameter(DAG, VT, MemVT: VT, SL: DL, Chain: DAG.getEntryNode(),
10166 Offset: SI::KernelInputOffsets::NGROUPS_X, Alignment: Align(4),
10167 Signed: false);
10168 case Intrinsic::r600_read_ngroups_y:
10169 if (Subtarget->isAmdHsaOS())
10170 return emitNonHSAIntrinsicError(DAG, DL, VT);
10171
10172 return lowerKernargMemParameter(DAG, VT, MemVT: VT, SL: DL, Chain: DAG.getEntryNode(),
10173 Offset: SI::KernelInputOffsets::NGROUPS_Y, Alignment: Align(4),
10174 Signed: false);
10175 case Intrinsic::r600_read_ngroups_z:
10176 if (Subtarget->isAmdHsaOS())
10177 return emitNonHSAIntrinsicError(DAG, DL, VT);
10178
10179 return lowerKernargMemParameter(DAG, VT, MemVT: VT, SL: DL, Chain: DAG.getEntryNode(),
10180 Offset: SI::KernelInputOffsets::NGROUPS_Z, Alignment: Align(4),
10181 Signed: false);
10182 case Intrinsic::r600_read_local_size_x:
10183 if (Subtarget->isAmdHsaOS())
10184 return emitNonHSAIntrinsicError(DAG, DL, VT);
10185
10186 return lowerImplicitZextParam(DAG, Op, VT: MVT::i16,
10187 Offset: SI::KernelInputOffsets::LOCAL_SIZE_X);
10188 case Intrinsic::r600_read_local_size_y:
10189 if (Subtarget->isAmdHsaOS())
10190 return emitNonHSAIntrinsicError(DAG, DL, VT);
10191
10192 return lowerImplicitZextParam(DAG, Op, VT: MVT::i16,
10193 Offset: SI::KernelInputOffsets::LOCAL_SIZE_Y);
10194 case Intrinsic::r600_read_local_size_z:
10195 if (Subtarget->isAmdHsaOS())
10196 return emitNonHSAIntrinsicError(DAG, DL, VT);
10197
10198 return lowerImplicitZextParam(DAG, Op, VT: MVT::i16,
10199 Offset: SI::KernelInputOffsets::LOCAL_SIZE_Z);
10200 case Intrinsic::amdgcn_workgroup_id_x:
10201 return lowerWorkGroupId(DAG, MFI: *MFI, VT,
10202 WorkGroupIdPV: AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
10203 ClusterMaxIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X,
10204 ClusterWorkGroupIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X);
10205 case Intrinsic::amdgcn_workgroup_id_y:
10206 return lowerWorkGroupId(DAG, MFI: *MFI, VT,
10207 WorkGroupIdPV: AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
10208 ClusterMaxIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y,
10209 ClusterWorkGroupIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y);
10210 case Intrinsic::amdgcn_workgroup_id_z:
10211 return lowerWorkGroupId(DAG, MFI: *MFI, VT,
10212 WorkGroupIdPV: AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
10213 ClusterMaxIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z,
10214 ClusterWorkGroupIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z);
10215 case Intrinsic::amdgcn_cluster_id_x:
10216 return Subtarget->hasClusters()
10217 ? getPreloadedValue(DAG, MFI: *MFI, VT,
10218 PVID: AMDGPUFunctionArgInfo::WORKGROUP_ID_X)
10219 : DAG.getPOISON(VT);
10220 case Intrinsic::amdgcn_cluster_id_y:
10221 return Subtarget->hasClusters()
10222 ? getPreloadedValue(DAG, MFI: *MFI, VT,
10223 PVID: AMDGPUFunctionArgInfo::WORKGROUP_ID_Y)
10224 : DAG.getPOISON(VT);
10225 case Intrinsic::amdgcn_cluster_id_z:
10226 return Subtarget->hasClusters()
10227 ? getPreloadedValue(DAG, MFI: *MFI, VT,
10228 PVID: AMDGPUFunctionArgInfo::WORKGROUP_ID_Z)
10229 : DAG.getPOISON(VT);
10230 case Intrinsic::amdgcn_cluster_workgroup_id_x:
10231 return Subtarget->hasClusters()
10232 ? getPreloadedValue(
10233 DAG, MFI: *MFI, VT,
10234 PVID: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X)
10235 : DAG.getPOISON(VT);
10236 case Intrinsic::amdgcn_cluster_workgroup_id_y:
10237 return Subtarget->hasClusters()
10238 ? getPreloadedValue(
10239 DAG, MFI: *MFI, VT,
10240 PVID: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y)
10241 : DAG.getPOISON(VT);
10242 case Intrinsic::amdgcn_cluster_workgroup_id_z:
10243 return Subtarget->hasClusters()
10244 ? getPreloadedValue(
10245 DAG, MFI: *MFI, VT,
10246 PVID: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z)
10247 : DAG.getPOISON(VT);
10248 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
10249 return Subtarget->hasClusters()
10250 ? lowerConstHwRegRead(DAG, Op, HwReg: AMDGPU::Hwreg::ID_IB_STS2, LowBit: 21, Width: 4)
10251 : SDValue();
10252 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
10253 return Subtarget->hasClusters()
10254 ? getPreloadedValue(
10255 DAG, MFI: *MFI, VT,
10256 PVID: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X)
10257 : DAG.getPOISON(VT);
10258 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
10259 return Subtarget->hasClusters()
10260 ? getPreloadedValue(
10261 DAG, MFI: *MFI, VT,
10262 PVID: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y)
10263 : DAG.getPOISON(VT);
10264 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
10265 return Subtarget->hasClusters()
10266 ? getPreloadedValue(
10267 DAG, MFI: *MFI, VT,
10268 PVID: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z)
10269 : DAG.getPOISON(VT);
10270 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
10271 return Subtarget->hasClusters()
10272 ? getPreloadedValue(
10273 DAG, MFI: *MFI, VT,
10274 PVID: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID)
10275 : DAG.getPOISON(VT);
10276 case Intrinsic::amdgcn_wave_id:
10277 return lowerWaveID(DAG, Op);
10278 case Intrinsic::amdgcn_lds_kernel_id: {
10279 if (MFI->isEntryFunction())
10280 return getLDSKernelId(DAG, SL: DL);
10281 return getPreloadedValue(DAG, MFI: *MFI, VT,
10282 PVID: AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
10283 }
10284 case Intrinsic::amdgcn_workitem_id_x:
10285 return lowerWorkitemID(DAG, Op, Dim: 0, Arg: MFI->getArgInfo().WorkItemIDX);
10286 case Intrinsic::amdgcn_workitem_id_y:
10287 return lowerWorkitemID(DAG, Op, Dim: 1, Arg: MFI->getArgInfo().WorkItemIDY);
10288 case Intrinsic::amdgcn_workitem_id_z:
10289 return lowerWorkitemID(DAG, Op, Dim: 2, Arg: MFI->getArgInfo().WorkItemIDZ);
10290 case Intrinsic::amdgcn_wavefrontsize:
10291 return DAG.getConstant(Val: MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
10292 DL: SDLoc(Op), VT: MVT::i32);
10293 case Intrinsic::amdgcn_s_buffer_load: {
10294 unsigned CPol = Op.getConstantOperandVal(i: 3);
10295    // s_buffer_load, because of how it's optimized, can't be volatile,
10296    // so reject ones with the volatile bit set.
10297 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
10298 ? AMDGPU::CPol::ALL
10299 : AMDGPU::CPol::ALL_pregfx12))
10300 return Op;
10301 return lowerSBuffer(VT, DL, Rsrc: Op.getOperand(i: 1), Offset: Op.getOperand(i: 2),
10302 CachePolicy: Op.getOperand(i: 3), DAG);
10303 }
10304 case Intrinsic::amdgcn_fdiv_fast:
10305 return lowerFDIV_FAST(Op, DAG);
10306 case Intrinsic::amdgcn_sin:
10307 return DAG.getNode(Opcode: AMDGPUISD::SIN_HW, DL, VT, Operand: Op.getOperand(i: 1));
10308
10309 case Intrinsic::amdgcn_cos:
10310 return DAG.getNode(Opcode: AMDGPUISD::COS_HW, DL, VT, Operand: Op.getOperand(i: 1));
10311
10312 case Intrinsic::amdgcn_mul_u24:
10313 return DAG.getNode(Opcode: AMDGPUISD::MUL_U24, DL, VT, N1: Op.getOperand(i: 1),
10314 N2: Op.getOperand(i: 2));
10315 case Intrinsic::amdgcn_mul_i24:
10316 return DAG.getNode(Opcode: AMDGPUISD::MUL_I24, DL, VT, N1: Op.getOperand(i: 1),
10317 N2: Op.getOperand(i: 2));
10318
10319 case Intrinsic::amdgcn_log_clamp: {
10320 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
10321 return SDValue();
10322
10323 return emitRemovedIntrinsicError(DAG, DL, VT);
10324 }
10325 case Intrinsic::amdgcn_fract:
10326 return DAG.getNode(Opcode: AMDGPUISD::FRACT, DL, VT, Operand: Op.getOperand(i: 1));
10327
10328 case Intrinsic::amdgcn_class:
10329 return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL, VT, N1: Op.getOperand(i: 1),
10330 N2: Op.getOperand(i: 2));
10331 case Intrinsic::amdgcn_div_fmas:
10332 return DAG.getNode(Opcode: AMDGPUISD::DIV_FMAS, DL, VT, N1: Op.getOperand(i: 1),
10333 N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3), N4: Op.getOperand(i: 4));
10334
10335 case Intrinsic::amdgcn_div_fixup:
10336 return DAG.getNode(Opcode: AMDGPUISD::DIV_FIXUP, DL, VT, N1: Op.getOperand(i: 1),
10337 N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
10338
10339 case Intrinsic::amdgcn_div_scale: {
10340 const ConstantSDNode *Param = cast<ConstantSDNode>(Val: Op.getOperand(i: 3));
10341
10342    // Translate to the operands expected by the machine instruction. The
10343    // first source operand must match either the numerator or the denominator.
10344 SDValue Numerator = Op.getOperand(i: 1);
10345 SDValue Denominator = Op.getOperand(i: 2);
10346
10347 // Note this order is opposite of the machine instruction's operations,
10348 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
10349 // intrinsic has the numerator as the first operand to match a normal
10350 // division operation.
10351
10352 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
10353
10354 return DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL, VTList: Op->getVTList(), N1: Src0,
10355 N2: Denominator, N3: Numerator);
10356 }
10357 case Intrinsic::amdgcn_icmp: {
10358 // There is a Pat that handles this variant, so return it as-is.
10359 if (Op.getOperand(i: 1).getValueType() == MVT::i1 &&
10360 Op.getConstantOperandVal(i: 2) == 0 &&
10361 Op.getConstantOperandVal(i: 3) == ICmpInst::Predicate::ICMP_NE)
10362 return Op;
10363 return lowerICMPIntrinsic(TLI: *this, N: Op.getNode(), DAG);
10364 }
10365 case Intrinsic::amdgcn_fcmp: {
10366 return lowerFCMPIntrinsic(TLI: *this, N: Op.getNode(), DAG);
10367 }
10368 case Intrinsic::amdgcn_ballot:
10369 return lowerBALLOTIntrinsic(TLI: *this, N: Op.getNode(), DAG);
10370 case Intrinsic::amdgcn_fmed3:
10371 return DAG.getNode(Opcode: AMDGPUISD::FMED3, DL, VT, N1: Op.getOperand(i: 1),
10372 N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
10373 case Intrinsic::amdgcn_fdot2:
10374 return DAG.getNode(Opcode: AMDGPUISD::FDOT2, DL, VT, N1: Op.getOperand(i: 1),
10375 N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3), N4: Op.getOperand(i: 4));
10376 case Intrinsic::amdgcn_fmul_legacy:
10377 return DAG.getNode(Opcode: AMDGPUISD::FMUL_LEGACY, DL, VT, N1: Op.getOperand(i: 1),
10378 N2: Op.getOperand(i: 2));
10379 case Intrinsic::amdgcn_sffbh:
10380 return DAG.getNode(Opcode: AMDGPUISD::FFBH_I32, DL, VT, Operand: Op.getOperand(i: 1));
10381 case Intrinsic::amdgcn_sbfe:
10382 return DAG.getNode(Opcode: AMDGPUISD::BFE_I32, DL, VT, N1: Op.getOperand(i: 1),
10383 N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
10384 case Intrinsic::amdgcn_ubfe:
10385 return DAG.getNode(Opcode: AMDGPUISD::BFE_U32, DL, VT, N1: Op.getOperand(i: 1),
10386 N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
10387 case Intrinsic::amdgcn_cvt_pkrtz:
10388 case Intrinsic::amdgcn_cvt_pknorm_i16:
10389 case Intrinsic::amdgcn_cvt_pknorm_u16:
10390 case Intrinsic::amdgcn_cvt_pk_i16:
10391 case Intrinsic::amdgcn_cvt_pk_u16: {
10392 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
10393 EVT VT = Op.getValueType();
10394 unsigned Opcode;
10395
10396 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
10397 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
10398 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
10399 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
10400 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
10401 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
10402 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
10403 Opcode = AMDGPUISD::CVT_PK_I16_I32;
10404 else
10405 Opcode = AMDGPUISD::CVT_PK_U16_U32;
10406
10407 if (isTypeLegal(VT))
10408 return DAG.getNode(Opcode, DL, VT, N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
10409
10410 SDValue Node =
10411 DAG.getNode(Opcode, DL, VT: MVT::i32, N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
10412 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Node);
10413 }
10414 case Intrinsic::amdgcn_fmad_ftz:
10415 return DAG.getNode(Opcode: AMDGPUISD::FMAD_FTZ, DL, VT, N1: Op.getOperand(i: 1),
10416 N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
10417
10418 case Intrinsic::amdgcn_if_break:
10419 return SDValue(DAG.getMachineNode(Opcode: AMDGPU::SI_IF_BREAK, dl: DL, VT,
10420 Op1: Op->getOperand(Num: 1), Op2: Op->getOperand(Num: 2)),
10421 0);
10422
10423 case Intrinsic::amdgcn_groupstaticsize: {
10424 Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
10425 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
10426 return Op;
10427
10428 const Module *M = MF.getFunction().getParent();
10429 const GlobalValue *GV =
10430 Intrinsic::getDeclarationIfExists(M, id: Intrinsic::amdgcn_groupstaticsize);
10431 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i32, offset: 0,
10432 TargetFlags: SIInstrInfo::MO_ABS32_LO);
10433 return {DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32, Op1: GA), 0};
10434 }
10435 case Intrinsic::amdgcn_is_shared:
10436 case Intrinsic::amdgcn_is_private: {
10437 SDLoc SL(Op);
10438 SDValue SrcVec =
10439 DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v2i32, Operand: Op.getOperand(i: 1));
10440 SDValue SrcHi = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: SrcVec,
10441 N2: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32));
10442
10443 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
10444 ? AMDGPUAS::LOCAL_ADDRESS
10445 : AMDGPUAS::PRIVATE_ADDRESS;
10446 if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
10447 Subtarget->hasGloballyAddressableScratch()) {
10448 SDValue FlatScratchBaseHi(
10449 DAG.getMachineNode(
10450 Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32,
10451 Op1: DAG.getRegister(Reg: AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, VT: MVT::i32)),
10452 0);
10453 // Test bits 63..58 against the aperture address.
10454 return DAG.getSetCC(
10455 DL: SL, VT: MVT::i1,
10456 LHS: DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i32, N1: SrcHi, N2: FlatScratchBaseHi),
10457 RHS: DAG.getConstant(Val: 1u << 26, DL: SL, VT: MVT::i32), Cond: ISD::SETULT);
10458 }
10459
10460 SDValue Aperture = getSegmentAperture(AS, DL: SL, DAG);
10461 return DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: SrcHi, RHS: Aperture, Cond: ISD::SETEQ);
10462 }
10463 case Intrinsic::amdgcn_perm:
10464 return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: Op.getOperand(i: 1),
10465 N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
10466 case Intrinsic::amdgcn_reloc_constant: {
10467 Module *M = MF.getFunction().getParent();
10468 const MDNode *Metadata = cast<MDNodeSDNode>(Val: Op.getOperand(i: 1))->getMD();
10469 auto SymbolName = cast<MDString>(Val: Metadata->getOperand(I: 0))->getString();
10470 auto *RelocSymbol = cast<GlobalVariable>(
10471 Val: M->getOrInsertGlobal(Name: SymbolName, Ty: Type::getInt32Ty(C&: M->getContext())));
10472 SDValue GA = DAG.getTargetGlobalAddress(GV: RelocSymbol, DL, VT: MVT::i32, offset: 0,
10473 TargetFlags: SIInstrInfo::MO_ABS32_LO);
10474 return {DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32, Op1: GA), 0};
10475 }
10476 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
10477 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
10478 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
10479 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
10480 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
10481 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
10482 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
10483 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
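    // These SWMMAC variants expect an i32 index key. If the operand already
    // has that type, no adjustment is needed; otherwise rebuild the intrinsic
    // with the key any-extended or truncated to i32.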
10484 if (Op.getOperand(i: 4).getValueType() == MVT::i32)
10485 return SDValue();
10486
10487 SDLoc SL(Op);
10488 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: 4), DL: SL, VT: MVT::i32);
10489 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: Op.getValueType(),
10490 N1: Op.getOperand(i: 0), N2: Op.getOperand(i: 1), N3: Op.getOperand(i: 2),
10491 N4: Op.getOperand(i: 3), N5: IndexKeyi32);
10492 }
10493 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
10494 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
10495 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
10496 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
10497 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
10498 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
10499 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
10500 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
10501 if (Op.getOperand(i: 4).getValueType() == MVT::i64)
10502 return SDValue();
10503
10504 SDLoc SL(Op);
10505 auto IndexKeyi64 = DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: 4), DL: SL, VT: MVT::i64);
10506 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: Op.getValueType(),
10507 Ops: {Op.getOperand(i: 0), Op.getOperand(i: 1), Op.getOperand(i: 2),
10508 Op.getOperand(i: 3), IndexKeyi64, Op.getOperand(i: 5),
10509 Op.getOperand(i: 6)});
10510 }
10511 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
10512 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
10513 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
10514 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
10515 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
10516 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
10517 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
10518 ? MVT::i64
10519 : MVT::i32;
10520 if (Op.getOperand(i: 6).getValueType() == IndexKeyTy)
10521 return SDValue();
10522
10523 SDLoc SL(Op);
10524 auto IndexKey = DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: 6), DL: SL, VT: IndexKeyTy);
10525 SmallVector<SDValue> Args{
10526 Op.getOperand(i: 0), Op.getOperand(i: 1), Op.getOperand(i: 2),
10527 Op.getOperand(i: 3), Op.getOperand(i: 4), Op.getOperand(i: 5),
10528 IndexKey, Op.getOperand(i: 7), Op.getOperand(i: 8)};
10529 if (IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8)
10530 Args.push_back(Elt: Op.getOperand(i: 9));
10531 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: Op.getValueType(), Ops: Args);
10532 }
10533 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
10534 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
10535 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
10536 if (Op.getOperand(i: 6).getValueType() == MVT::i32)
10537 return SDValue();
10538
10539 SDLoc SL(Op);
10540 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: 6), DL: SL, VT: MVT::i32);
10541 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: Op.getValueType(),
10542 Ops: {Op.getOperand(i: 0), Op.getOperand(i: 1), Op.getOperand(i: 2),
10543 Op.getOperand(i: 3), Op.getOperand(i: 4), Op.getOperand(i: 5),
10544 IndexKeyi32, Op.getOperand(i: 7)});
10545 }
10546 case Intrinsic::amdgcn_addrspacecast_nonnull:
10547 return lowerADDRSPACECAST(Op, DAG);
10548 case Intrinsic::amdgcn_readlane:
10549 case Intrinsic::amdgcn_readfirstlane:
10550 case Intrinsic::amdgcn_writelane:
10551 case Intrinsic::amdgcn_permlane16:
10552 case Intrinsic::amdgcn_permlanex16:
10553 case Intrinsic::amdgcn_permlane64:
10554 case Intrinsic::amdgcn_set_inactive:
10555 case Intrinsic::amdgcn_set_inactive_chain_arg:
10556 case Intrinsic::amdgcn_mov_dpp8:
10557 case Intrinsic::amdgcn_update_dpp:
10558 return lowerLaneOp(TLI: *this, N: Op.getNode(), DAG);
10559 case Intrinsic::amdgcn_dead: {
10560 SmallVector<SDValue, 8> Poisons;
10561 for (const EVT ValTy : Op.getNode()->values())
10562 Poisons.push_back(Elt: DAG.getPOISON(VT: ValTy));
10563 return DAG.getMergeValues(Ops: Poisons, dl: SDLoc(Op));
10564 }
10565 case Intrinsic::amdgcn_wave_shuffle:
10566 return lowerWaveShuffle(TLI: *this, N: Op.getNode(), DAG);
10567 default:
10568 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10569 AMDGPU::getImageDimIntrinsicInfo(Intr: IntrinsicID))
10570 return lowerImage(Op, Intr: ImageDimIntr, DAG, WithChain: false);
10571
10572 return Op;
10573 }
10574}
10575
10576// On targets that do not support a constant in the soffset field, turn zero
10577// into SGPR_NULL to avoid generating an extra s_mov with zero.
10578static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
10579 const GCNSubtarget *Subtarget) {
10580 if (Subtarget->hasRestrictedSOffset() && isNullConstant(V: SOffset))
10581 return DAG.getRegister(Reg: AMDGPU::SGPR_NULL, VT: MVT::i32);
10582 return SOffset;
10583}
10584
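// Lower a raw buffer atomic intrinsic to the corresponding BUFFER_ATOMIC_*
// target node: split the combined offset into voffset/immediate parts and
// build the canonical operand list with vindex = 0 and idxen = 0.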
10585SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
10586 SelectionDAG &DAG,
10587 unsigned NewOpcode) const {
10588 SDLoc DL(Op);
10589
10590 SDValue VData = Op.getOperand(i: 2);
10591 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG);
10592 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG);
10593 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget);
10594 SDValue Ops[] = {
10595 Op.getOperand(i: 0), // Chain
10596 VData, // vdata
10597 Rsrc, // rsrc
10598 DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
10599 VOffset, // voffset
10600 SOffset, // soffset
10601 Offset, // offset
10602 Op.getOperand(i: 6), // cachepolicy
10603 DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
10604 };
10605
10606 auto *M = cast<MemSDNode>(Val&: Op);
10607
10608 EVT MemVT = VData.getValueType();
10609 return DAG.getMemIntrinsicNode(Opcode: NewOpcode, dl: DL, VTList: Op->getVTList(), Ops, MemVT,
10610 MMO: M->getMemOperand());
10611}
10612
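// Same as the raw variant above, but the struct form carries an explicit
// vindex operand and sets idxen = 1.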
10613SDValue
10614SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
10615 unsigned NewOpcode) const {
10616 SDLoc DL(Op);
10617
10618 SDValue VData = Op.getOperand(i: 2);
10619 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG);
10620 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 5), DAG);
10621 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 6), DAG, Subtarget);
10622 SDValue Ops[] = {
10623 Op.getOperand(i: 0), // Chain
10624 VData, // vdata
10625 Rsrc, // rsrc
10626 Op.getOperand(i: 4), // vindex
10627 VOffset, // voffset
10628 SOffset, // soffset
10629 Offset, // offset
10630 Op.getOperand(i: 7), // cachepolicy
10631 DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // idxen
10632 };
10633
10634 auto *M = cast<MemSDNode>(Val&: Op);
10635
10636 EVT MemVT = VData.getValueType();
10637 return DAG.getMemIntrinsicNode(Opcode: NewOpcode, dl: DL, VTList: Op->getVTList(), Ops, MemVT,
10638 MMO: M->getMemOperand());
10639}
10640
10641SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
10642 SelectionDAG &DAG) const {
10643 unsigned IntrID = Op.getConstantOperandVal(i: 1);
10644 SDLoc DL(Op);
10645
10646 switch (IntrID) {
10647 case Intrinsic::amdgcn_ds_ordered_add:
10648 case Intrinsic::amdgcn_ds_ordered_swap: {
10649 MemSDNode *M = cast<MemSDNode>(Val&: Op);
10650 SDValue Chain = M->getOperand(Num: 0);
10651 SDValue M0 = M->getOperand(Num: 2);
10652 SDValue Value = M->getOperand(Num: 3);
10653 unsigned IndexOperand = M->getConstantOperandVal(Num: 7);
10654 unsigned WaveRelease = M->getConstantOperandVal(Num: 8);
10655 unsigned WaveDone = M->getConstantOperandVal(Num: 9);
10656
10657 unsigned OrderedCountIndex = IndexOperand & 0x3f;
10658 IndexOperand &= ~0x3f;
10659 unsigned CountDw = 0;
10660
10661 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
10662 CountDw = (IndexOperand >> 24) & 0xf;
10663 IndexOperand &= ~(0xf << 24);
10664
10665 if (CountDw < 1 || CountDw > 4) {
10666 const Function &Fn = DAG.getMachineFunction().getFunction();
10667 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
10668 Fn, "ds_ordered_count: dword count must be between 1 and 4",
10669 DL.getDebugLoc()));
10670 CountDw = 1;
10671 }
10672 }
10673
10674 if (IndexOperand) {
10675 const Function &Fn = DAG.getMachineFunction().getFunction();
10676 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
10677 Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
10678 }
10679
10680 if (WaveDone && !WaveRelease) {
10681 // TODO: Move this to IR verifier
10682 const Function &Fn = DAG.getMachineFunction().getFunction();
10683 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
10684 Fn, "ds_ordered_count: wave_done requires wave_release",
10685 DL.getDebugLoc()));
10686 }
10687
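    // Pack the ds_ordered_count controls into the 16-bit offset field: Offset0
    // (the low byte) holds the ordered-count index shifted left by 2, while
    // Offset1 (the high byte) packs wave_release, wave_done, the add/swap
    // selector, the shader type on pre-GFX11, and the dword count on GFX10+.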
10688 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
10689 unsigned ShaderType =
10690 SIInstrInfo::getDSShaderTypeValue(MF: DAG.getMachineFunction());
10691 unsigned Offset0 = OrderedCountIndex << 2;
10692 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
10693
10694 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
10695 Offset1 |= (CountDw - 1) << 6;
10696
10697 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
10698 Offset1 |= ShaderType << 2;
10699
10700 unsigned Offset = Offset0 | (Offset1 << 8);
10701
10702 SDValue Ops[] = {
10703 Chain, Value, DAG.getTargetConstant(Val: Offset, DL, VT: MVT::i16),
10704 copyToM0(DAG, Chain, DL, V: M0).getValue(R: 1), // Glue
10705 };
10706 return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::DS_ORDERED_COUNT, dl: DL,
10707 VTList: M->getVTList(), Ops, MemVT: M->getMemoryVT(),
10708 MMO: M->getMemOperand());
10709 }
10710 case Intrinsic::amdgcn_raw_buffer_load:
10711 case Intrinsic::amdgcn_raw_ptr_buffer_load:
10712 case Intrinsic::amdgcn_raw_atomic_buffer_load:
10713 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
10714 case Intrinsic::amdgcn_raw_buffer_load_format:
10715 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
10716 const bool IsFormat =
10717 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
10718 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
10719
10720 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG);
10721 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 3), DAG);
10722 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 4), DAG, Subtarget);
10723 SDValue Ops[] = {
10724 Op.getOperand(i: 0), // Chain
10725 Rsrc, // rsrc
10726 DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
10727 VOffset, // voffset
10728 SOffset, // soffset
10729 Offset, // offset
10730 Op.getOperand(i: 5), // cachepolicy, swizzled buffer
10731 DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
10732 };
10733
10734 auto *M = cast<MemSDNode>(Val&: Op);
10735 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
10736 }
10737 case Intrinsic::amdgcn_struct_buffer_load:
10738 case Intrinsic::amdgcn_struct_ptr_buffer_load:
10739 case Intrinsic::amdgcn_struct_buffer_load_format:
10740 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
10741 case Intrinsic::amdgcn_struct_atomic_buffer_load:
10742 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
10743 const bool IsFormat =
10744 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
10745 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
10746
10747 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG);
10748 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG);
10749 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget);
10750 SDValue Ops[] = {
10751 Op.getOperand(i: 0), // Chain
10752 Rsrc, // rsrc
10753 Op.getOperand(i: 3), // vindex
10754 VOffset, // voffset
10755 SOffset, // soffset
10756 Offset, // offset
10757 Op.getOperand(i: 6), // cachepolicy, swizzled buffer
10758 DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // idxen
10759 };
10760
10761 return lowerIntrinsicLoad(M: cast<MemSDNode>(Val&: Op), IsFormat, DAG, Ops);
10762 }
10763 case Intrinsic::amdgcn_raw_tbuffer_load:
10764 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
10765 MemSDNode *M = cast<MemSDNode>(Val&: Op);
10766 EVT LoadVT = Op.getValueType();
10767 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG);
10768 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 3), DAG);
10769 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 4), DAG, Subtarget);
10770
10771 SDValue Ops[] = {
10772 Op.getOperand(i: 0), // Chain
10773 Rsrc, // rsrc
10774 DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
10775 VOffset, // voffset
10776 SOffset, // soffset
10777 Offset, // offset
10778 Op.getOperand(i: 5), // format
10779 Op.getOperand(i: 6), // cachepolicy, swizzled buffer
10780 DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
10781 };
10782
10783 if (LoadVT.getScalarType() == MVT::f16)
10784 return adjustLoadValueType(Opcode: AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10785 Ops);
10786 return getMemIntrinsicNode(Opcode: AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10787 VTList: Op->getVTList(), Ops, MemVT: LoadVT, MMO: M->getMemOperand(),
10788 DAG);
10789 }
10790 case Intrinsic::amdgcn_struct_tbuffer_load:
10791 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
10792 MemSDNode *M = cast<MemSDNode>(Val&: Op);
10793 EVT LoadVT = Op.getValueType();
10794 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG);
10795 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG);
10796 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget);
10797
10798 SDValue Ops[] = {
10799 Op.getOperand(i: 0), // Chain
10800 Rsrc, // rsrc
10801 Op.getOperand(i: 3), // vindex
10802 VOffset, // voffset
10803 SOffset, // soffset
10804 Offset, // offset
10805 Op.getOperand(i: 6), // format
10806 Op.getOperand(i: 7), // cachepolicy, swizzled buffer
10807 DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // idxen
10808 };
10809
10810 if (LoadVT.getScalarType() == MVT::f16)
10811 return adjustLoadValueType(Opcode: AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10812 Ops);
10813 return getMemIntrinsicNode(Opcode: AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10814 VTList: Op->getVTList(), Ops, MemVT: LoadVT, MMO: M->getMemOperand(),
10815 DAG);
10816 }
10817 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
10818 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
10819 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FADD);
10820 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
10821 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
10822 return lowerStructBufferAtomicIntrin(Op, DAG,
10823 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FADD);
10824 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
10825 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
10826 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FMIN);
10827 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
10828 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
10829 return lowerStructBufferAtomicIntrin(Op, DAG,
10830 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FMIN);
10831 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
10832 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
10833 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FMAX);
10834 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
10835 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
10836 return lowerStructBufferAtomicIntrin(Op, DAG,
10837 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FMAX);
10838 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
10839 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
10840 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SWAP);
10841 case Intrinsic::amdgcn_raw_buffer_atomic_add:
10842 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
10843 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_ADD);
10844 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
10845 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
10846 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SUB);
10847 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
10848 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
10849 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SMIN);
10850 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
10851 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
10852 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_UMIN);
10853 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
10854 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
10855 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SMAX);
10856 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
10857 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
10858 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_UMAX);
10859 case Intrinsic::amdgcn_raw_buffer_atomic_and:
10860 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
10861 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_AND);
10862 case Intrinsic::amdgcn_raw_buffer_atomic_or:
10863 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
10864 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_OR);
10865 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
10866 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
10867 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_XOR);
10868 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
10869 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
10870 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_INC);
10871 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
10872 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
10873 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_DEC);
10874 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
10875 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
10876 return lowerStructBufferAtomicIntrin(Op, DAG,
10877 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SWAP);
10878 case Intrinsic::amdgcn_struct_buffer_atomic_add:
10879 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
10880 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_ADD);
10881 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
10882 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
10883 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SUB);
10884 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
10885 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
10886 return lowerStructBufferAtomicIntrin(Op, DAG,
10887 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SMIN);
10888 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
10889 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
10890 return lowerStructBufferAtomicIntrin(Op, DAG,
10891 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_UMIN);
10892 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
10893 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
10894 return lowerStructBufferAtomicIntrin(Op, DAG,
10895 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SMAX);
10896 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
10897 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
10898 return lowerStructBufferAtomicIntrin(Op, DAG,
10899 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_UMAX);
10900 case Intrinsic::amdgcn_struct_buffer_atomic_and:
10901 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
10902 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_AND);
10903 case Intrinsic::amdgcn_struct_buffer_atomic_or:
10904 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
10905 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_OR);
10906 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
10907 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
10908 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_XOR);
10909 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
10910 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
10911 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_INC);
10912 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
10913 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
10914 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_DEC);
10915 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
10916 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
10917 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_CSUB);
10918 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
10919 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
10920 return lowerStructBufferAtomicIntrin(Op, DAG,
10921 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_CSUB);
10922 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
10923 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
10924 return lowerRawBufferAtomicIntrin(Op, DAG,
10925 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10926 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
10927 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
10928 return lowerStructBufferAtomicIntrin(Op, DAG,
10929 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10930 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
10931 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
10932 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 4), DAG);
10933 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 5), DAG);
10934 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 6), DAG, Subtarget);
10935 SDValue Ops[] = {
10936 Op.getOperand(i: 0), // Chain
10937 Op.getOperand(i: 2), // src
10938 Op.getOperand(i: 3), // cmp
10939 Rsrc, // rsrc
10940 DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
10941 VOffset, // voffset
10942 SOffset, // soffset
10943 Offset, // offset
10944 Op.getOperand(i: 7), // cachepolicy
10945 DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
10946 };
10947 EVT VT = Op.getValueType();
10948 auto *M = cast<MemSDNode>(Val&: Op);
10949
10950 return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, dl: DL,
10951 VTList: Op->getVTList(), Ops, MemVT: VT,
10952 MMO: M->getMemOperand());
10953 }
10954 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
10955 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
10956 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op->getOperand(Num: 4), DAG);
10957 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 6), DAG);
10958 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 7), DAG, Subtarget);
10959 SDValue Ops[] = {
10960 Op.getOperand(i: 0), // Chain
10961 Op.getOperand(i: 2), // src
10962 Op.getOperand(i: 3), // cmp
10963 Rsrc, // rsrc
10964 Op.getOperand(i: 5), // vindex
10965 VOffset, // voffset
10966 SOffset, // soffset
10967 Offset, // offset
10968 Op.getOperand(i: 8), // cachepolicy
10969 DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // idxen
10970 };
10971 EVT VT = Op.getValueType();
10972 auto *M = cast<MemSDNode>(Val&: Op);
10973
10974 return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, dl: DL,
10975 VTList: Op->getVTList(), Ops, MemVT: VT,
10976 MMO: M->getMemOperand());
10977 }
10978 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
10979 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
10980 MemSDNode *M = cast<MemSDNode>(Val&: Op);
10981 SDValue NodePtr = M->getOperand(Num: 2);
10982 SDValue RayExtent = M->getOperand(Num: 3);
10983 SDValue InstanceMask = M->getOperand(Num: 4);
10984 SDValue RayOrigin = M->getOperand(Num: 5);
10985 SDValue RayDir = M->getOperand(Num: 6);
10986 SDValue Offsets = M->getOperand(Num: 7);
10987 SDValue TDescr = M->getOperand(Num: 8);
10988
10989 assert(NodePtr.getValueType() == MVT::i64);
10990 assert(RayDir.getValueType() == MVT::v3f32);
10991
10992 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
10993 emitRemovedIntrinsicError(DAG, DL, VT: Op.getValueType());
10994 return SDValue();
10995 }
10996
10997 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
10998 const unsigned NumVDataDwords = 10;
10999 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
11000 int Opcode = AMDGPU::getMIMGOpcode(
11001 BaseOpcode: IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
11002 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
11003 MIMGEncoding: AMDGPU::MIMGEncGfx12, VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
11004 assert(Opcode != -1);
11005
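    // VADDR layout: node pointer, packed {ray_extent, instance_mask}, ray
    // origin, ray direction, offsets, and the descriptor (TDescr), followed by
    // the chain.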
11006 SmallVector<SDValue, 7> Ops;
11007 Ops.push_back(Elt: NodePtr);
11008 Ops.push_back(Elt: DAG.getBuildVector(
11009 VT: MVT::v2i32, DL,
11010 Ops: {DAG.getBitcast(VT: MVT::i32, V: RayExtent),
11011 DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::i32, Operand: InstanceMask)}));
11012 Ops.push_back(Elt: RayOrigin);
11013 Ops.push_back(Elt: RayDir);
11014 Ops.push_back(Elt: Offsets);
11015 Ops.push_back(Elt: TDescr);
11016 Ops.push_back(Elt: M->getChain());
11017
11018 auto *NewNode = DAG.getMachineNode(Opcode, dl: DL, VTs: M->getVTList(), Ops);
11019 MachineMemOperand *MemRef = M->getMemOperand();
11020 DAG.setNodeMemRefs(N: NewNode, NewMemRefs: {MemRef});
11021 return SDValue(NewNode, 0);
11022 }
11023 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
11024 MemSDNode *M = cast<MemSDNode>(Val&: Op);
11025 SDValue NodePtr = M->getOperand(Num: 2);
11026 SDValue RayExtent = M->getOperand(Num: 3);
11027 SDValue RayOrigin = M->getOperand(Num: 4);
11028 SDValue RayDir = M->getOperand(Num: 5);
11029 SDValue RayInvDir = M->getOperand(Num: 6);
11030 SDValue TDescr = M->getOperand(Num: 7);
11031
11032 assert(NodePtr.getValueType() == MVT::i32 ||
11033 NodePtr.getValueType() == MVT::i64);
11034 assert(RayDir.getValueType() == MVT::v3f16 ||
11035 RayDir.getValueType() == MVT::v3f32);
11036
11037 if (!Subtarget->hasGFX10_AEncoding()) {
11038 emitRemovedIntrinsicError(DAG, DL, VT: Op.getValueType());
11039 return SDValue();
11040 }
11041
11042 const bool IsGFX11 = AMDGPU::isGFX11(STI: *Subtarget);
11043 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI: *Subtarget);
11044 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI: *Subtarget);
11045 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
11046 const bool Is64 = NodePtr.getValueType() == MVT::i64;
11047 const unsigned NumVDataDwords = 4;
11048 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
11049 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
11050 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
11051 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
11052 IsGFX12Plus;
11053 const unsigned BaseOpcodes[2][2] = {
11054 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
11055 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
11056 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
11057 int Opcode;
11058 if (UseNSA) {
11059 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: BaseOpcodes[Is64][IsA16],
11060 MIMGEncoding: IsGFX12Plus ? AMDGPU::MIMGEncGfx12
11061 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
11062 : AMDGPU::MIMGEncGfx10NSA,
11063 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
11064 } else {
11065 assert(!IsGFX12Plus);
11066 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: BaseOpcodes[Is64][IsA16],
11067 MIMGEncoding: IsGFX11 ? AMDGPU::MIMGEncGfx11Default
11068 : AMDGPU::MIMGEncGfx10Default,
11069 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
11070 }
11071 assert(Opcode != -1);
11072
11073 SmallVector<SDValue, 16> Ops;
11074
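    // packLanes pushes the first three lanes of Op: 32-bit lanes are emitted
    // as individual dwords, while f16 lanes are packed two per dword. When the
    // vector is not dword-aligned, the first lane is merged into the
    // previously pushed dword.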
11075 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
11076 SmallVector<SDValue, 3> Lanes;
11077 DAG.ExtractVectorElements(Op, Args&: Lanes, Start: 0, Count: 3);
11078 if (Lanes[0].getValueSizeInBits() == 32) {
11079 for (unsigned I = 0; I < 3; ++I)
11080 Ops.push_back(Elt: DAG.getBitcast(VT: MVT::i32, V: Lanes[I]));
11081 } else {
11082 if (IsAligned) {
11083 Ops.push_back(Elt: DAG.getBitcast(
11084 VT: MVT::i32,
11085 V: DAG.getBuildVector(VT: MVT::v2f16, DL, Ops: {Lanes[0], Lanes[1]})));
11086 Ops.push_back(Elt: Lanes[2]);
11087 } else {
11088 SDValue Elt0 = Ops.pop_back_val();
11089 Ops.push_back(Elt: DAG.getBitcast(
11090 VT: MVT::i32, V: DAG.getBuildVector(VT: MVT::v2f16, DL, Ops: {Elt0, Lanes[0]})));
11091 Ops.push_back(Elt: DAG.getBitcast(
11092 VT: MVT::i32,
11093 V: DAG.getBuildVector(VT: MVT::v2f16, DL, Ops: {Lanes[1], Lanes[2]})));
11094 }
11095 }
11096 };
11097
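    // On GFX11+ the NSA form takes vector operands directly; for a16 the ray
    // direction and inverse direction are interleaved component-wise into
    // v2f16 dwords.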
11098 if (UseNSA && IsGFX11Plus) {
11099 Ops.push_back(Elt: NodePtr);
11100 Ops.push_back(Elt: DAG.getBitcast(VT: MVT::i32, V: RayExtent));
11101 Ops.push_back(Elt: RayOrigin);
11102 if (IsA16) {
11103 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
11104 DAG.ExtractVectorElements(Op: RayDir, Args&: DirLanes, Start: 0, Count: 3);
11105 DAG.ExtractVectorElements(Op: RayInvDir, Args&: InvDirLanes, Start: 0, Count: 3);
11106 for (unsigned I = 0; I < 3; ++I) {
11107 MergedLanes.push_back(Elt: DAG.getBitcast(
11108 VT: MVT::i32, V: DAG.getBuildVector(VT: MVT::v2f16, DL,
11109 Ops: {DirLanes[I], InvDirLanes[I]})));
11110 }
11111 Ops.push_back(Elt: DAG.getBuildVector(VT: MVT::v3i32, DL, Ops: MergedLanes));
11112 } else {
11113 Ops.push_back(Elt: RayDir);
11114 Ops.push_back(Elt: RayInvDir);
11115 }
11116 } else {
11117 if (Is64)
11118 DAG.ExtractVectorElements(Op: DAG.getBitcast(VT: MVT::v2i32, V: NodePtr), Args&: Ops, Start: 0,
11119 Count: 2);
11120 else
11121 Ops.push_back(Elt: NodePtr);
11122
11123 Ops.push_back(Elt: DAG.getBitcast(VT: MVT::i32, V: RayExtent));
11124 packLanes(RayOrigin, true);
11125 packLanes(RayDir, true);
11126 packLanes(RayInvDir, false);
11127 }
11128
11129 if (!UseNSA) {
      // Build a single vector containing all the operands prepared so far.
      if (NumVAddrDwords > 12) {
        SDValue Poison = DAG.getPOISON(VT: MVT::i32);
        Ops.append(NumInputs: 16 - Ops.size(), Elt: Poison);
11134 }
11135 assert(Ops.size() >= 8 && Ops.size() <= 12);
11136 SDValue MergedOps =
11137 DAG.getBuildVector(VT: MVT::getVectorVT(VT: MVT::i32, NumElements: Ops.size()), DL, Ops);
11138 Ops.clear();
11139 Ops.push_back(Elt: MergedOps);
11140 }
11141
11142 Ops.push_back(Elt: TDescr);
11143 Ops.push_back(Elt: DAG.getTargetConstant(Val: IsA16, DL, VT: MVT::i1));
11144 Ops.push_back(Elt: M->getChain());
11145
11146 auto *NewNode = DAG.getMachineNode(Opcode, dl: DL, VTs: M->getVTList(), Ops);
11147 MachineMemOperand *MemRef = M->getMemOperand();
11148 DAG.setNodeMemRefs(N: NewNode, NewMemRefs: {MemRef});
11149 return SDValue(NewNode, 0);
11150 }
11151 case Intrinsic::amdgcn_global_atomic_fmin_num:
11152 case Intrinsic::amdgcn_global_atomic_fmax_num:
11153 case Intrinsic::amdgcn_flat_atomic_fmin_num:
11154 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
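    // These lower directly to the generic FP min/max atomic nodes
    // (ISD::ATOMIC_LOAD_FMIN/FMAX).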
11155 MemSDNode *M = cast<MemSDNode>(Val&: Op);
11156 SDValue Ops[] = {
11157 M->getOperand(Num: 0), // Chain
11158 M->getOperand(Num: 2), // Ptr
11159 M->getOperand(Num: 3) // Value
11160 };
11161 unsigned Opcode = 0;
11162 switch (IntrID) {
11163 case Intrinsic::amdgcn_global_atomic_fmin_num:
11164 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
11165 Opcode = ISD::ATOMIC_LOAD_FMIN;
11166 break;
11167 }
11168 case Intrinsic::amdgcn_global_atomic_fmax_num:
11169 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
11170 Opcode = ISD::ATOMIC_LOAD_FMAX;
11171 break;
11172 }
11173 default:
11174 llvm_unreachable("unhandled atomic opcode");
11175 }
11176 return DAG.getAtomic(Opcode, dl: SDLoc(Op), MemVT: M->getMemoryVT(), VTList: M->getVTList(),
11177 Ops, MMO: M->getMemOperand());
11178 }
11179 case Intrinsic::amdgcn_s_get_barrier_state:
11180 case Intrinsic::amdgcn_s_get_named_barrier_state: {
11181 SDValue Chain = Op->getOperand(Num: 0);
11182 SmallVector<SDValue, 2> Ops;
11183 unsigned Opc;
11184
11185 if (isa<ConstantSDNode>(Val: Op->getOperand(Num: 2))) {
11186 uint64_t BarID = cast<ConstantSDNode>(Val: Op->getOperand(Num: 2))->getZExtValue();
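      // Named barriers carry the barrier ID in bits 4-9 of the operand.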
11187 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
11188 BarID = (BarID >> 4) & 0x3F;
11189 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
11190 SDValue K = DAG.getTargetConstant(Val: BarID, DL, VT: MVT::i32);
11191 Ops.push_back(Elt: K);
11192 Ops.push_back(Elt: Chain);
11193 } else {
11194 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
11195 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
11196 SDValue M0Val;
11197 M0Val = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: Op->getOperand(Num: 2),
11198 N2: DAG.getShiftAmountConstant(Val: 4, VT: MVT::i32, DL));
11199 M0Val = SDValue(
11200 DAG.getMachineNode(Opcode: AMDGPU::S_AND_B32, dl: DL, VT: MVT::i32, Op1: M0Val,
11201 Op2: DAG.getTargetConstant(Val: 0x3F, DL, VT: MVT::i32)),
11202 0);
11203 Ops.push_back(Elt: copyToM0(DAG, Chain, DL, V: M0Val).getValue(R: 0));
11204 } else
11205 Ops.push_back(Elt: copyToM0(DAG, Chain, DL, V: Op->getOperand(Num: 2)).getValue(R: 0));
11206 }
11207
11208 auto *NewMI = DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: Op->getVTList(), Ops);
11209 return SDValue(NewMI, 0);
11210 }
11211 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
11212 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
11213 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
11214 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Val&: Op);
11215 SDValue Chain = Op->getOperand(Num: 0);
11216 SDValue Ptr = Op->getOperand(Num: 2);
11217 EVT VT = Op->getValueType(ResNo: 0);
11218 return DAG.getAtomicLoad(ExtType: ISD::NON_EXTLOAD, dl: DL, MemVT: MII->getMemoryVT(), VT,
11219 Chain, Ptr, MMO: MII->getMemOperand());
11220 }
  default:
11223 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11224 AMDGPU::getImageDimIntrinsicInfo(Intr: IntrID))
11225 return lowerImage(Op, Intr: ImageDimIntr, DAG, WithChain: true);
11226
11227 return SDValue();
11228 }
11229}
11230
11231// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
11232// dwordx4 if on SI and handle TFE loads.
11233SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
11234 SDVTList VTList,
11235 ArrayRef<SDValue> Ops, EVT MemVT,
11236 MachineMemOperand *MMO,
11237 SelectionDAG &DAG) const {
11238 LLVMContext &C = *DAG.getContext();
11239 MachineFunction &MF = DAG.getMachineFunction();
11240 EVT VT = VTList.VTs[0];
11241
11242 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
11243 bool IsTFE = VTList.NumVTs == 3;
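  // TFE loads return an extra status dword after the data, so load one more
  // dword than the value needs and split the result into value and status.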
11244 if (IsTFE) {
11245 unsigned NumValueDWords = divideCeil(Numerator: VT.getSizeInBits(), Denominator: 32);
11246 unsigned NumOpDWords = NumValueDWords + 1;
11247 EVT OpDWordsVT = EVT::getVectorVT(Context&: C, VT: MVT::i32, NumElements: NumOpDWords);
11248 SDVTList OpDWordsVTList = DAG.getVTList(VT1: OpDWordsVT, VT2: VTList.VTs[2]);
11249 MachineMemOperand *OpDWordsMMO =
11250 MF.getMachineMemOperand(MMO, Offset: 0, Size: NumOpDWords * 4);
11251 SDValue Op = getMemIntrinsicNode(Opcode, DL, VTList: OpDWordsVTList, Ops,
11252 MemVT: OpDWordsVT, MMO: OpDWordsMMO, DAG);
11253 SDValue Status = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: Op,
11254 N2: DAG.getVectorIdxConstant(Val: NumValueDWords, DL));
11255 SDValue ZeroIdx = DAG.getVectorIdxConstant(Val: 0, DL);
11256 SDValue ValueDWords =
11257 NumValueDWords == 1
11258 ? DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: Op, N2: ZeroIdx)
11259 : DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL,
11260 VT: EVT::getVectorVT(Context&: C, VT: MVT::i32, NumElements: NumValueDWords), N1: Op,
11261 N2: ZeroIdx);
11262 SDValue Value = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: ValueDWords);
11263 return DAG.getMergeValues(Ops: {Value, Status, SDValue(Op.getNode(), 1)}, dl: DL);
11264 }
11265
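  // Without 96-bit memory operations, widen dwordx3 to dwordx4 and extract the
  // low three elements from the widened result.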
11266 if (!Subtarget->hasDwordx3LoadStores() &&
11267 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
11268 EVT WidenedVT = EVT::getVectorVT(Context&: C, VT: VT.getVectorElementType(), NumElements: 4);
11269 EVT WidenedMemVT = EVT::getVectorVT(Context&: C, VT: MemVT.getVectorElementType(), NumElements: 4);
11270 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, Offset: 0, Size: 16);
11271 SDVTList WidenedVTList = DAG.getVTList(VT1: WidenedVT, VT2: VTList.VTs[1]);
11272 SDValue Op = DAG.getMemIntrinsicNode(Opcode, dl: DL, VTList: WidenedVTList, Ops,
11273 MemVT: WidenedMemVT, MMO: WidenedMMO);
11274 SDValue Value = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: Op,
11275 N2: DAG.getVectorIdxConstant(Val: 0, DL));
11276 return DAG.getMergeValues(Ops: {Value, SDValue(Op.getNode(), 1)}, dl: DL);
11277 }
11278
11279 return DAG.getMemIntrinsicNode(Opcode, dl: DL, VTList, Ops, MemVT, MMO);
11280}
11281
11282SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
11283 bool ImageStore) const {
11284 EVT StoreVT = VData.getValueType();
11285
11286 // No change for f16 and legal vector D16 types.
11287 if (!StoreVT.isVector())
11288 return VData;
11289
11290 SDLoc DL(VData);
11291 unsigned NumElements = StoreVT.getVectorNumElements();
11292
11293 if (Subtarget->hasUnpackedD16VMem()) {
11294 // We need to unpack the packed data to store.
11295 EVT IntStoreVT = StoreVT.changeTypeToInteger();
11296 SDValue IntVData = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IntStoreVT, Operand: VData);
11297
11298 EVT EquivStoreVT =
11299 EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements);
11300 SDValue ZExt = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: EquivStoreVT, Operand: IntVData);
11301 return DAG.UnrollVectorOp(N: ZExt.getNode());
11302 }
11303
11304 // The sq block of gfx8.1 does not estimate register use correctly for d16
11305 // image store instructions. The data operand is computed as if it were not a
11306 // d16 image instruction.
11307 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
11308 // Bitcast to i16
11309 EVT IntStoreVT = StoreVT.changeTypeToInteger();
11310 SDValue IntVData = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IntStoreVT, Operand: VData);
11311
11312 // Decompose into scalars
11313 SmallVector<SDValue, 4> Elts;
11314 DAG.ExtractVectorElements(Op: IntVData, Args&: Elts);
11315
11316 // Group pairs of i16 into v2i16 and bitcast to i32
11317 SmallVector<SDValue, 4> PackedElts;
11318 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
11319 SDValue Pair =
11320 DAG.getBuildVector(VT: MVT::v2i16, DL, Ops: {Elts[I * 2], Elts[I * 2 + 1]});
11321 SDValue IntPair = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i32, Operand: Pair);
11322 PackedElts.push_back(Elt: IntPair);
11323 }
11324 if ((NumElements % 2) == 1) {
11325 // Handle v3i16
11326 unsigned I = Elts.size() / 2;
11327 SDValue Pair = DAG.getBuildVector(VT: MVT::v2i16, DL,
11328 Ops: {Elts[I * 2], DAG.getPOISON(VT: MVT::i16)});
11329 SDValue IntPair = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i32, Operand: Pair);
11330 PackedElts.push_back(Elt: IntPair);
11331 }
11332
    // Pad with poison values
11334 PackedElts.resize(N: Elts.size(), NV: DAG.getPOISON(VT: MVT::i32));
11335
11336 // Build final vector
11337 EVT VecVT =
11338 EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements: PackedElts.size());
11339 return DAG.getBuildVector(VT: VecVT, DL, Ops: PackedElts);
11340 }
11341
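  // Widen packed v3 data to v4 by zero-extending its integer representation.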
11342 if (NumElements == 3) {
11343 EVT IntStoreVT =
11344 EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: StoreVT.getStoreSizeInBits());
11345 SDValue IntVData = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IntStoreVT, Operand: VData);
11346
11347 EVT WidenedStoreVT = EVT::getVectorVT(
11348 Context&: *DAG.getContext(), VT: StoreVT.getVectorElementType(), NumElements: NumElements + 1);
11349 EVT WidenedIntVT = EVT::getIntegerVT(Context&: *DAG.getContext(),
11350 BitWidth: WidenedStoreVT.getStoreSizeInBits());
11351 SDValue ZExt = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: WidenedIntVT, Operand: IntVData);
11352 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: WidenedStoreVT, Operand: ZExt);
11353 }
11354
11355 assert(isTypeLegal(StoreVT));
11356 return VData;
11357}
11358
11359SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
11360 SelectionDAG &DAG) const {
11361 SDLoc DL(Op);
11362 SDValue Chain = Op.getOperand(i: 0);
11363 unsigned IntrinsicID = Op.getConstantOperandVal(i: 1);
11364 MachineFunction &MF = DAG.getMachineFunction();
11365
11366 switch (IntrinsicID) {
11367 case Intrinsic::amdgcn_exp_compr: {
11368 if (!Subtarget->hasCompressedExport()) {
11369 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
11370 DAG.getMachineFunction().getFunction(),
11371 "intrinsic not supported on subtarget", DL.getDebugLoc()));
11372 }
11373 SDValue Src0 = Op.getOperand(i: 4);
11374 SDValue Src1 = Op.getOperand(i: 5);
11375 // Hack around illegal type on SI by directly selecting it.
11376 if (isTypeLegal(VT: Src0.getValueType()))
11377 return SDValue();
11378
11379 const ConstantSDNode *Done = cast<ConstantSDNode>(Val: Op.getOperand(i: 6));
11380 SDValue Undef = DAG.getPOISON(VT: MVT::f32);
11381 const SDValue Ops[] = {
11382 Op.getOperand(i: 2), // tgt
11383 DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f32, Operand: Src0), // src0
11384 DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f32, Operand: Src1), // src1
11385 Undef, // src2
11386 Undef, // src3
11387 Op.getOperand(i: 7), // vm
11388 DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // compr
11389 Op.getOperand(i: 3), // en
11390 Op.getOperand(i: 0) // Chain
11391 };
11392
11393 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
11394 return SDValue(DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: Op->getVTList(), Ops), 0);
11395 }
11396
11397 case Intrinsic::amdgcn_struct_tbuffer_store:
11398 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
11399 SDValue VData = Op.getOperand(i: 2);
11400 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11401 if (IsD16)
11402 VData = handleD16VData(VData, DAG);
11403 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG);
11404 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 5), DAG);
11405 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 6), DAG, Subtarget);
11406 SDValue Ops[] = {
11407 Chain,
11408 VData, // vdata
11409 Rsrc, // rsrc
11410 Op.getOperand(i: 4), // vindex
11411 VOffset, // voffset
11412 SOffset, // soffset
11413 Offset, // offset
11414 Op.getOperand(i: 7), // format
11415 Op.getOperand(i: 8), // cachepolicy, swizzled buffer
11416 DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // idxen
11417 };
11418 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11419 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11420 MemSDNode *M = cast<MemSDNode>(Val&: Op);
11421 return DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: Op->getVTList(), Ops,
11422 MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
11423 }
11424
11425 case Intrinsic::amdgcn_raw_tbuffer_store:
11426 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
11427 SDValue VData = Op.getOperand(i: 2);
11428 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11429 if (IsD16)
11430 VData = handleD16VData(VData, DAG);
11431 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG);
11432 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG);
11433 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget);
11434 SDValue Ops[] = {
11435 Chain,
11436 VData, // vdata
11437 Rsrc, // rsrc
11438 DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
11439 VOffset, // voffset
11440 SOffset, // soffset
11441 Offset, // offset
11442 Op.getOperand(i: 6), // format
11443 Op.getOperand(i: 7), // cachepolicy, swizzled buffer
11444 DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
11445 };
11446 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11447 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11448 MemSDNode *M = cast<MemSDNode>(Val&: Op);
11449 return DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: Op->getVTList(), Ops,
11450 MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
11451 }
11452
11453 case Intrinsic::amdgcn_raw_buffer_store:
11454 case Intrinsic::amdgcn_raw_ptr_buffer_store:
11455 case Intrinsic::amdgcn_raw_buffer_store_format:
11456 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
11457 const bool IsFormat =
11458 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
11459 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
11460
11461 SDValue VData = Op.getOperand(i: 2);
11462 EVT VDataVT = VData.getValueType();
11463 EVT EltType = VDataVT.getScalarType();
11464 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
11465 if (IsD16) {
11466 VData = handleD16VData(VData, DAG);
11467 VDataVT = VData.getValueType();
11468 }
11469
11470 if (!isTypeLegal(VT: VDataVT)) {
11471 VData =
11472 DAG.getNode(Opcode: ISD::BITCAST, DL,
11473 VT: getEquivalentMemType(Context&: *DAG.getContext(), VT: VDataVT), Operand: VData);
11474 }
11475
11476 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG);
11477 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG);
11478 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget);
11479 SDValue Ops[] = {
11480 Chain,
11481 VData,
11482 Rsrc,
11483 DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
11484 VOffset, // voffset
11485 SOffset, // soffset
11486 Offset, // offset
11487 Op.getOperand(i: 6), // cachepolicy, swizzled buffer
11488 DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
11489 };
11490 unsigned Opc =
11491 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
11492 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
11493 MemSDNode *M = cast<MemSDNode>(Val&: Op);
11494
11495 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
11496 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
11497 return handleByteShortBufferStores(DAG, VDataType: VDataVT, DL, Ops, M);
11498
11499 return DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: Op->getVTList(), Ops,
11500 MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
11501 }
11502
11503 case Intrinsic::amdgcn_struct_buffer_store:
11504 case Intrinsic::amdgcn_struct_ptr_buffer_store:
11505 case Intrinsic::amdgcn_struct_buffer_store_format:
11506 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
11507 const bool IsFormat =
11508 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
11509 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
11510
11511 SDValue VData = Op.getOperand(i: 2);
11512 EVT VDataVT = VData.getValueType();
11513 EVT EltType = VDataVT.getScalarType();
11514 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
11515
11516 if (IsD16) {
11517 VData = handleD16VData(VData, DAG);
11518 VDataVT = VData.getValueType();
11519 }
11520
11521 if (!isTypeLegal(VT: VDataVT)) {
11522 VData =
11523 DAG.getNode(Opcode: ISD::BITCAST, DL,
11524 VT: getEquivalentMemType(Context&: *DAG.getContext(), VT: VDataVT), Operand: VData);
11525 }
11526
11527 auto Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG);
11528 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 5), DAG);
11529 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 6), DAG, Subtarget);
11530 SDValue Ops[] = {
11531 Chain,
11532 VData,
11533 Rsrc,
11534 Op.getOperand(i: 4), // vindex
11535 VOffset, // voffset
11536 SOffset, // soffset
11537 Offset, // offset
11538 Op.getOperand(i: 7), // cachepolicy, swizzled buffer
11539 DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // idxen
11540 };
11541 unsigned Opc =
11542 !IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
11543 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
11544 MemSDNode *M = cast<MemSDNode>(Val&: Op);
11545
11546 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
11547 EVT VDataType = VData.getValueType().getScalarType();
11548 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
11549 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
11550
11551 return DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: Op->getVTList(), Ops,
11552 MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
11553 }
11554 case Intrinsic::amdgcn_raw_buffer_load_lds:
11555 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
11556 case Intrinsic::amdgcn_struct_buffer_load_lds:
11557 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
11558 if (!Subtarget->hasVMemToLDSLoad())
11559 return SDValue();
11560 unsigned Opc;
11561 bool HasVIndex =
11562 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
11563 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
11564 unsigned OpOffset = HasVIndex ? 1 : 0;
11565 SDValue VOffset = Op.getOperand(i: 5 + OpOffset);
11566 bool HasVOffset = !isNullConstant(V: VOffset);
11567 unsigned Size = Op->getConstantOperandVal(Num: 4);
11568
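    // Pick the LDS-load MUBUF opcode from the transfer size and from which of
    // vindex/voffset are present.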
11569 switch (Size) {
11570 default:
11571 return SDValue();
11572 case 1:
11573 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
11574 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
11575 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
11576 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
11577 break;
11578 case 2:
11579 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
11580 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
11581 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
11582 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
11583 break;
11584 case 4:
11585 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
11586 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
11587 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
11588 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
11589 break;
11590 case 12:
11591 if (!Subtarget->hasLDSLoadB96_B128())
11592 return SDValue();
11593 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
11594 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
11595 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
11596 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
11597 break;
11598 case 16:
11599 if (!Subtarget->hasLDSLoadB96_B128())
11600 return SDValue();
11601 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
11602 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
11603 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
11604 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
11605 break;
11606 }
11607
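    // The LDS destination pointer (operand 3) is copied into M0.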
11608 SDValue M0Val = copyToM0(DAG, Chain, DL, V: Op.getOperand(i: 3));
11609
11610 SmallVector<SDValue, 8> Ops;
11611
11612 if (HasVIndex && HasVOffset)
11613 Ops.push_back(Elt: DAG.getBuildVector(VT: MVT::v2i32, DL,
11614 Ops: {Op.getOperand(i: 5), // VIndex
11615 VOffset}));
11616 else if (HasVIndex)
11617 Ops.push_back(Elt: Op.getOperand(i: 5));
11618 else if (HasVOffset)
11619 Ops.push_back(Elt: VOffset);
11620
11621 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG);
11622 Ops.push_back(Elt: Rsrc);
11623 Ops.push_back(Elt: Op.getOperand(i: 6 + OpOffset)); // soffset
11624 Ops.push_back(Elt: Op.getOperand(i: 7 + OpOffset)); // imm offset
11625 bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI: *Subtarget);
11626 unsigned Aux = Op.getConstantOperandVal(i: 8 + OpOffset);
11627 Ops.push_back(Elt: DAG.getTargetConstant(
11628 Val: Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
11629 DL, VT: MVT::i8)); // cpol
11630 Ops.push_back(Elt: DAG.getTargetConstant(
11631 Val: Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
11632 ? 1
11633 : 0,
11634 DL, VT: MVT::i8)); // swz
11635 Ops.push_back(Elt: M0Val.getValue(R: 0)); // Chain
11636 Ops.push_back(Elt: M0Val.getValue(R: 1)); // Glue
11637
11638 auto *M = cast<MemSDNode>(Val&: Op);
11639 MachineMemOperand *LoadMMO = M->getMemOperand();
11640 // Don't set the offset value here because the pointer points to the base of
11641 // the buffer.
11642 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
11643
11644 MachinePointerInfo StorePtrI = LoadPtrI;
11645 LoadPtrI.V = PoisonValue::get(
11646 T: PointerType::get(C&: *DAG.getContext(), AddressSpace: AMDGPUAS::BUFFER_RESOURCE));
11647 LoadPtrI.AddrSpace = AMDGPUAS::BUFFER_RESOURCE;
11648 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
11649
11650 auto F = LoadMMO->getFlags() &
11651 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
11652 LoadMMO =
11653 MF.getMachineMemOperand(PtrInfo: LoadPtrI, F: F | MachineMemOperand::MOLoad, Size,
11654 BaseAlignment: LoadMMO->getBaseAlign(), AAInfo: LoadMMO->getAAInfo());
11655
11656 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
11657 PtrInfo: StorePtrI, F: F | MachineMemOperand::MOStore, Size: sizeof(int32_t),
11658 BaseAlignment: LoadMMO->getBaseAlign(), AAInfo: LoadMMO->getAAInfo());
11659
11660 auto *Load = DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: M->getVTList(), Ops);
11661 DAG.setNodeMemRefs(N: Load, NewMemRefs: {LoadMMO, StoreMMO});
11662
11663 return SDValue(Load, 0);
11664 }
  // Buffers are handled by LowerBufferFatPointers; assume the remaining cases
  // are global pointers until we can attach two memory operands to a single
  // intrinsic.
11668 case Intrinsic::amdgcn_load_to_lds:
11669 case Intrinsic::amdgcn_global_load_lds: {
11670 if (!Subtarget->hasVMemToLDSLoad())
11671 return SDValue();
11672
11673 unsigned Opc;
11674 unsigned Size = Op->getConstantOperandVal(Num: 4);
11675 switch (Size) {
11676 default:
11677 return SDValue();
11678 case 1:
11679 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
11680 break;
11681 case 2:
11682 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
11683 break;
11684 case 4:
11685 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
11686 break;
11687 case 12:
11688 if (!Subtarget->hasLDSLoadB96_B128())
11689 return SDValue();
11690 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
11691 break;
11692 case 16:
11693 if (!Subtarget->hasLDSLoadB96_B128())
11694 return SDValue();
11695 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
11696 break;
11697 }
11698
11699 SDValue M0Val = copyToM0(DAG, Chain, DL, V: Op.getOperand(i: 3));
11700
11701 SmallVector<SDValue, 6> Ops;
11702
11703 SDValue Addr = Op.getOperand(i: 2); // Global ptr
11704 SDValue VOffset;
11705 // Try to split SAddr and VOffset. Global and LDS pointers share the same
11706 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
11707 if (Addr->isDivergent() && Addr->isAnyAdd()) {
11708 SDValue LHS = Addr.getOperand(i: 0);
11709 SDValue RHS = Addr.getOperand(i: 1);
11710
11711 if (LHS->isDivergent())
11712 std::swap(a&: LHS, b&: RHS);
11713
11714 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
11715 RHS.getOperand(i: 0).getValueType() == MVT::i32) {
11716 // add (i64 sgpr), (zero_extend (i32 vgpr))
11717 Addr = LHS;
11718 VOffset = RHS.getOperand(i: 0);
11719 }
11720 }
11721
11722 Ops.push_back(Elt: Addr);
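    // A uniform address selects the SADDR form; materialize a zero voffset if
    // the address split above did not produce one.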
11723 if (!Addr->isDivergent()) {
11724 Opc = AMDGPU::getGlobalSaddrOp(Opcode: Opc);
11725 if (!VOffset)
11726 VOffset =
11727 SDValue(DAG.getMachineNode(Opcode: AMDGPU::V_MOV_B32_e32, dl: DL, VT: MVT::i32,
11728 Op1: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32)),
11729 0);
11730 Ops.push_back(Elt: VOffset);
11731 }
11732
11733 Ops.push_back(Elt: Op.getOperand(i: 5)); // Offset
11734
11735 unsigned Aux = Op.getConstantOperandVal(i: 6);
11736 Ops.push_back(Elt: DAG.getTargetConstant(Val: Aux & ~AMDGPU::CPol::VIRTUAL_BITS, DL,
11737 VT: MVT::i32)); // CPol
11738
11739 Ops.push_back(Elt: M0Val.getValue(R: 0)); // Chain
11740 Ops.push_back(Elt: M0Val.getValue(R: 1)); // Glue
11741
11742 auto *M = cast<MemSDNode>(Val&: Op);
11743 MachineMemOperand *LoadMMO = M->getMemOperand();
11744 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
11745 LoadPtrI.Offset = Op->getConstantOperandVal(Num: 5);
11746 MachinePointerInfo StorePtrI = LoadPtrI;
11747 LoadPtrI.V = PoisonValue::get(
11748 T: PointerType::get(C&: *DAG.getContext(), AddressSpace: AMDGPUAS::GLOBAL_ADDRESS));
11749 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
11750 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
11751 auto F = LoadMMO->getFlags() &
11752 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
11753 LoadMMO =
11754 MF.getMachineMemOperand(PtrInfo: LoadPtrI, F: F | MachineMemOperand::MOLoad, Size,
11755 BaseAlignment: LoadMMO->getBaseAlign(), AAInfo: LoadMMO->getAAInfo());
11756 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
11757 PtrInfo: StorePtrI, F: F | MachineMemOperand::MOStore, Size: sizeof(int32_t), BaseAlignment: Align(4),
11758 AAInfo: LoadMMO->getAAInfo());
11759
11760 auto *Load = DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: Op->getVTList(), Ops);
11761 DAG.setNodeMemRefs(N: Load, NewMemRefs: {LoadMMO, StoreMMO});
11762
11763 return SDValue(Load, 0);
11764 }
11765 case Intrinsic::amdgcn_end_cf:
11766 return SDValue(DAG.getMachineNode(Opcode: AMDGPU::SI_END_CF, dl: DL, VT: MVT::Other,
11767 Op1: Op->getOperand(Num: 2), Op2: Chain),
11768 0);
11769 case Intrinsic::amdgcn_s_barrier_init:
11770 case Intrinsic::amdgcn_s_barrier_signal_var: {
    // These two intrinsics take two operands: the barrier pointer and the
    // member count.
11772 SDValue Chain = Op->getOperand(Num: 0);
11773 SmallVector<SDValue, 2> Ops;
11774 SDValue BarOp = Op->getOperand(Num: 2);
11775 SDValue CntOp = Op->getOperand(Num: 3);
11776 SDValue M0Val;
11777 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
11778 ? AMDGPU::S_BARRIER_INIT_M0
11779 : AMDGPU::S_BARRIER_SIGNAL_M0;
11780 // extract the BarrierID from bits 4-9 of BarOp
11781 SDValue BarID;
11782 BarID = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: BarOp,
11783 N2: DAG.getShiftAmountConstant(Val: 4, VT: MVT::i32, DL));
11784 BarID =
11785 SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_AND_B32, dl: DL, VT: MVT::i32, Op1: BarID,
11786 Op2: DAG.getTargetConstant(Val: 0x3F, DL, VT: MVT::i32)),
11787 0);
    // The member count goes into 6 bits of M0 starting at bit ShAmt;
    // the barrier ID goes into M0[5:0].
11790 M0Val =
11791 SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_AND_B32, dl: DL, VT: MVT::i32, Op1: CntOp,
11792 Op2: DAG.getTargetConstant(Val: 0x3F, DL, VT: MVT::i32)),
11793 0);
11794 constexpr unsigned ShAmt = 16;
11795 M0Val = DAG.getNode(Opcode: ISD::SHL, DL, VT: MVT::i32, N1: CntOp,
11796 N2: DAG.getShiftAmountConstant(Val: ShAmt, VT: MVT::i32, DL));
11797
11798 M0Val = SDValue(
11799 DAG.getMachineNode(Opcode: AMDGPU::S_OR_B32, dl: DL, VT: MVT::i32, Op1: M0Val, Op2: BarID), 0);
11800
11801 Ops.push_back(Elt: copyToM0(DAG, Chain, DL, V: M0Val).getValue(R: 0));
11802
11803 auto *NewMI = DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: Op->getVTList(), Ops);
11804 return SDValue(NewMI, 0);
11805 }
11806 case Intrinsic::amdgcn_s_wakeup_barrier: {
11807 if (!Subtarget->hasSWakeupBarrier())
11808 return SDValue();
11809 [[fallthrough]];
11810 }
11811 case Intrinsic::amdgcn_s_barrier_join: {
    // These two intrinsics take a single operand: the barrier pointer.
11813 SDValue Chain = Op->getOperand(Num: 0);
11814 SmallVector<SDValue, 2> Ops;
11815 SDValue BarOp = Op->getOperand(Num: 2);
11816 unsigned Opc;
11817
11818 if (isa<ConstantSDNode>(Val: BarOp)) {
11819 uint64_t BarVal = cast<ConstantSDNode>(Val&: BarOp)->getZExtValue();
11820 switch (IntrinsicID) {
11821 default:
11822 return SDValue();
11823 case Intrinsic::amdgcn_s_barrier_join:
11824 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
11825 break;
11826 case Intrinsic::amdgcn_s_wakeup_barrier:
11827 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
11828 break;
11829 }
11830 // extract the BarrierID from bits 4-9 of the immediate
11831 unsigned BarID = (BarVal >> 4) & 0x3F;
11832 SDValue K = DAG.getTargetConstant(Val: BarID, DL, VT: MVT::i32);
11833 Ops.push_back(Elt: K);
11834 Ops.push_back(Elt: Chain);
11835 } else {
11836 switch (IntrinsicID) {
11837 default:
11838 return SDValue();
11839 case Intrinsic::amdgcn_s_barrier_join:
11840 Opc = AMDGPU::S_BARRIER_JOIN_M0;
11841 break;
11842 case Intrinsic::amdgcn_s_wakeup_barrier:
11843 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
11844 break;
11845 }
11846 // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
11847 SDValue M0Val;
11848 M0Val = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: BarOp,
11849 N2: DAG.getShiftAmountConstant(Val: 4, VT: MVT::i32, DL));
11850 M0Val =
11851 SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_AND_B32, dl: DL, VT: MVT::i32, Op1: M0Val,
11852 Op2: DAG.getTargetConstant(Val: 0x3F, DL, VT: MVT::i32)),
11853 0);
11854 Ops.push_back(Elt: copyToM0(DAG, Chain, DL, V: M0Val).getValue(R: 0));
11855 }
11856
11857 auto *NewMI = DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: Op->getVTList(), Ops);
11858 return SDValue(NewMI, 0);
11859 }
11860 case Intrinsic::amdgcn_s_prefetch_data: {
11861 // For non-global address space preserve the chain and remove the call.
11862 if (!AMDGPU::isFlatGlobalAddrSpace(AS: cast<MemSDNode>(Val&: Op)->getAddressSpace()))
11863 return Op.getOperand(i: 0);
11864 return Op;
11865 }
11866 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
11867 SDValue Ops[] = {
11868 Chain, bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG),
11869 Op.getOperand(i: 3), // offset
11870 Op.getOperand(i: 4), // length
11871 };
11872
11873 MemSDNode *M = cast<MemSDNode>(Val&: Op);
11874 return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::SBUFFER_PREFETCH_DATA, dl: DL,
11875 VTList: Op->getVTList(), Ops, MemVT: M->getMemoryVT(),
11876 MMO: M->getMemOperand());
11877 }
11878 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
11879 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
11880 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
11881 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Val&: Op);
11882 SDValue Chain = Op->getOperand(Num: 0);
11883 SDValue Ptr = Op->getOperand(Num: 2);
11884 SDValue Val = Op->getOperand(Num: 3);
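    // ISD::ATOMIC_STORE takes (chain, val, ptr), so the value and pointer are
    // intentionally passed in swapped positions relative to getAtomic's
    // (Chain, Ptr, Val) parameter names.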
11885 return DAG.getAtomic(Opcode: ISD::ATOMIC_STORE, dl: DL, MemVT: MII->getMemoryVT(), Chain, Ptr: Val,
11886 Val: Ptr, MMO: MII->getMemOperand());
11887 }
11888 default: {
11889 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11890 AMDGPU::getImageDimIntrinsicInfo(Intr: IntrinsicID))
11891 return lowerImage(Op, Intr: ImageDimIntr, DAG, WithChain: true);
11892
11893 return Op;
11894 }
11895 }
11896}
11897
// Return whether the operation has the NoUnsignedWrap property.
11899static bool isNoUnsignedWrap(SDValue Addr) {
11900 return (Addr.getOpcode() == ISD::ADD &&
11901 Addr->getFlags().hasNoUnsignedWrap()) ||
11902 Addr->getOpcode() == ISD::OR;
11903}
11904
11905bool SITargetLowering::shouldPreservePtrArith(const Function &F,
11906 EVT PtrVT) const {
11907 return PtrVT == MVT::i64;
11908}
11909
11910bool SITargetLowering::canTransformPtrArithOutOfBounds(const Function &F,
11911 EVT PtrVT) const {
11912 return true;
11913}
11914
11915// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
11916// offset (the offset that is included in bounds checking and swizzling, to be
11917// split between the instruction's voffset and immoffset fields) and soffset
11918// (the offset that is excluded from bounds checking and swizzling, to go in
11919// the instruction's soffset field). This function takes the first kind of
11920// offset and figures out how to split it between voffset and immoffset.
11921std::pair<SDValue, SDValue>
11922SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
11923 SDLoc DL(Offset);
11924 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST: *Subtarget);
11925 SDValue N0 = Offset;
11926 ConstantSDNode *C1 = nullptr;
11927
11928 if ((C1 = dyn_cast<ConstantSDNode>(Val&: N0)))
11929 N0 = SDValue();
11930 else if (DAG.isBaseWithConstantOffset(Op: N0)) {
11931 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
11932 // being added, so we can only safely match a 32-bit addition with no
11933 // unsigned overflow.
11934 bool CheckNUW = Subtarget->hasGFX1250Insts();
11935 if (!CheckNUW || isNoUnsignedWrap(Addr: N0)) {
11936 C1 = cast<ConstantSDNode>(Val: N0.getOperand(i: 1));
11937 N0 = N0.getOperand(i: 0);
11938 }
11939 }
11940
11941 if (C1) {
11942 unsigned ImmOffset = C1->getZExtValue();
11943 // If the immediate value is too big for the immoffset field, put only bits
11944 // that would normally fit in the immoffset field. The remaining value that
11945 // is copied/added for the voffset field is a large power of 2, and it
11946 // stands more chance of being CSEd with the copy/add for another similar
11947 // load/store.
11948 // However, do not do that rounding down if that is a negative
11949 // number, as it appears to be illegal to have a negative offset in the
11950 // vgpr, even if adding the immediate offset makes it positive.
11951 unsigned Overflow = ImmOffset & ~MaxImm;
11952 ImmOffset -= Overflow;
11953 if ((int32_t)Overflow < 0) {
11954 Overflow += ImmOffset;
11955 ImmOffset = 0;
11956 }
11957 C1 = cast<ConstantSDNode>(Val: DAG.getTargetConstant(Val: ImmOffset, DL, VT: MVT::i32));
11958 if (Overflow) {
11959 auto OverflowVal = DAG.getConstant(Val: Overflow, DL, VT: MVT::i32);
11960 if (!N0)
11961 N0 = OverflowVal;
11962 else {
11963 SDValue Ops[] = {N0, OverflowVal};
11964 N0 = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, Ops);
11965 }
11966 }
11967 }
11968 if (!N0)
11969 N0 = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
11970 if (!C1)
11971 C1 = cast<ConstantSDNode>(Val: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32));
11972 return {N0, SDValue(C1, 0)};
11973}
11974
11975// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
11976// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
11977// pointed to by Offsets.
11978void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
11979 SelectionDAG &DAG, SDValue *Offsets,
11980 Align Alignment) const {
11981 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11982 SDLoc DL(CombinedOffset);
11983 if (auto *C = dyn_cast<ConstantSDNode>(Val&: CombinedOffset)) {
11984 uint32_t Imm = C->getZExtValue();
11985 uint32_t SOffset, ImmOffset;
11986 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
11987 Offsets[0] = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
11988 Offsets[1] = DAG.getConstant(Val: SOffset, DL, VT: MVT::i32);
11989 Offsets[2] = DAG.getTargetConstant(Val: ImmOffset, DL, VT: MVT::i32);
11990 return;
11991 }
11992 }
11993 if (DAG.isBaseWithConstantOffset(Op: CombinedOffset)) {
11994 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
11995 // being added, so we can only safely match a 32-bit addition with no
11996 // unsigned overflow.
11997 bool CheckNUW = Subtarget->hasGFX1250Insts();
11998 SDValue N0 = CombinedOffset.getOperand(i: 0);
11999 SDValue N1 = CombinedOffset.getOperand(i: 1);
12000 uint32_t SOffset, ImmOffset;
12001 int Offset = cast<ConstantSDNode>(Val&: N1)->getSExtValue();
12002 if (Offset >= 0 && (!CheckNUW || isNoUnsignedWrap(Addr: CombinedOffset)) &&
12003 TII->splitMUBUFOffset(Imm: Offset, SOffset, ImmOffset, Alignment)) {
12004 Offsets[0] = N0;
12005 Offsets[1] = DAG.getConstant(Val: SOffset, DL, VT: MVT::i32);
12006 Offsets[2] = DAG.getTargetConstant(Val: ImmOffset, DL, VT: MVT::i32);
12007 return;
12008 }
12009 }
12010
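  // Fallback: keep the whole offset in the VGPR, with a zero soffset
  // (SGPR_NULL when soffset is restricted) and a zero instruction offset.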
12011 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
12012 ? DAG.getRegister(Reg: AMDGPU::SGPR_NULL, VT: MVT::i32)
12013 : DAG.getConstant(Val: 0, DL, VT: MVT::i32);
12014
12015 Offsets[0] = CombinedOffset;
12016 Offsets[1] = SOffsetZero;
12017 Offsets[2] = DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32);
12018}
12019
12020SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
12021 SelectionDAG &DAG) const {
12022 if (!MaybePointer.getValueType().isScalarInteger())
12023 return MaybePointer;
12024
12025 SDValue Rsrc = DAG.getBitcast(VT: MVT::v4i32, V: MaybePointer);
12026 return Rsrc;
12027}
12028
12029// Wrap a global or flat pointer into a buffer intrinsic using the flags
12030// specified in the intrinsic.
12031SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
12032 SelectionDAG &DAG) const {
12033 SDLoc Loc(Op);
12034
12035 SDValue Pointer = Op->getOperand(Num: 1);
12036 SDValue Stride = Op->getOperand(Num: 2);
12037 SDValue NumRecords = Op->getOperand(Num: 3);
12038 SDValue Flags = Op->getOperand(Num: 4);
12039
12040 SDValue ExtStride = DAG.getAnyExtOrTrunc(Op: Stride, DL: Loc, VT: MVT::i32);
12041 SDValue Rsrc;
12042
12043 if (Subtarget->has45BitNumRecordsBufferResource()) {
12044 SDValue Zero = DAG.getConstant(Val: 0, DL: Loc, VT: MVT::i32);
    // Build the lower 64-bit value, which holds the 57-bit base address and
    // the low 7 bits of num_records.
12047 SDValue ExtPointer = DAG.getAnyExtOrTrunc(Op: Pointer, DL: Loc, VT: MVT::i64);
12048 SDValue NumRecordsLHS =
12049 DAG.getNode(Opcode: ISD::SHL, DL: Loc, VT: MVT::i64, N1: NumRecords,
12050 N2: DAG.getShiftAmountConstant(Val: 57, VT: MVT::i32, DL: Loc));
12051 SDValue LowHalf =
12052 DAG.getNode(Opcode: ISD::OR, DL: Loc, VT: MVT::i64, N1: ExtPointer, N2: NumRecordsLHS);
12053
    // Build the upper 64-bit value, which holds the upper 38 bits of
    // num_records, 6 zero bits (omitted), the 16-bit stride and scale fields,
    // and the 4-bit flags.
12056 SDValue NumRecordsRHS =
12057 DAG.getNode(Opcode: ISD::SRL, DL: Loc, VT: MVT::i64, N1: NumRecords,
12058 N2: DAG.getShiftAmountConstant(Val: 7, VT: MVT::i32, DL: Loc));
12059 SDValue ShiftedStride =
12060 DAG.getNode(Opcode: ISD::SHL, DL: Loc, VT: MVT::i32, N1: ExtStride,
12061 N2: DAG.getShiftAmountConstant(Val: 12, VT: MVT::i32, DL: Loc));
12062 SDValue ExtShiftedStrideVec =
12063 DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: Loc, VT: MVT::v2i32, N1: Zero, N2: ShiftedStride);
12064 SDValue ExtShiftedStride =
12065 DAG.getNode(Opcode: ISD::BITCAST, DL: Loc, VT: MVT::i64, Operand: ExtShiftedStrideVec);
12066 SDValue ShiftedFlags =
12067 DAG.getNode(Opcode: ISD::SHL, DL: Loc, VT: MVT::i32, N1: Flags,
12068 N2: DAG.getShiftAmountConstant(Val: 28, VT: MVT::i32, DL: Loc));
12069 SDValue ExtShiftedFlagsVec =
12070 DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: Loc, VT: MVT::v2i32, N1: Zero, N2: ShiftedFlags);
12071 SDValue ExtShiftedFlags =
12072 DAG.getNode(Opcode: ISD::BITCAST, DL: Loc, VT: MVT::i64, Operand: ExtShiftedFlagsVec);
12073 SDValue CombinedFields =
12074 DAG.getNode(Opcode: ISD::OR, DL: Loc, VT: MVT::i64, N1: NumRecordsRHS, N2: ExtShiftedStride);
12075 SDValue HighHalf =
12076 DAG.getNode(Opcode: ISD::OR, DL: Loc, VT: MVT::i64, N1: CombinedFields, N2: ExtShiftedFlags);
12077
12078 Rsrc = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: Loc, VT: MVT::v2i64, N1: LowHalf, N2: HighHalf);
12079 } else {
12080 NumRecords = DAG.getAnyExtOrTrunc(Op: NumRecords, DL: Loc, VT: MVT::i32);
12081 auto [LowHalf, HighHalf] =
12082 DAG.SplitScalar(N: Pointer, DL: Loc, LoVT: MVT::i32, HiVT: MVT::i32);
12083 SDValue Mask = DAG.getConstant(Val: 0x0000ffff, DL: Loc, VT: MVT::i32);
12084 SDValue Masked = DAG.getNode(Opcode: ISD::AND, DL: Loc, VT: MVT::i32, N1: HighHalf, N2: Mask);
12085 SDValue ShiftedStride =
12086 DAG.getNode(Opcode: ISD::SHL, DL: Loc, VT: MVT::i32, N1: ExtStride,
12087 N2: DAG.getShiftAmountConstant(Val: 16, VT: MVT::i32, DL: Loc));
12088 SDValue NewHighHalf =
12089 DAG.getNode(Opcode: ISD::OR, DL: Loc, VT: MVT::i32, N1: Masked, N2: ShiftedStride);
12090
12091 Rsrc = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: Loc, VT: MVT::v4i32, N1: LowHalf, N2: NewHighHalf,
12092 N3: NumRecords, N4: Flags);
12093 }
12094
12095 SDValue RsrcPtr = DAG.getNode(Opcode: ISD::BITCAST, DL: Loc, VT: MVT::i128, Operand: Rsrc);
12096 return RsrcPtr;
12097}
12098
12099// Handle 8-bit and 16-bit buffer loads.
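// With TFE, the hardware returns an extra status dword alongside the loaded
// data, so the TFE path below uses a v2i32 result and splits it into the data
// (element 0) and status (element 1) values. The exact status encoding is
// hardware-defined and not interpreted here.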
12100SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
12101 EVT LoadVT, SDLoc DL,
12102 ArrayRef<SDValue> Ops,
12103 MachineMemOperand *MMO,
12104 bool IsTFE) const {
12105 EVT IntVT = LoadVT.changeTypeToInteger();
12106
12107 if (IsTFE) {
12108 unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
12109 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
12110 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
12111 MachineFunction &MF = DAG.getMachineFunction();
12112 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, Offset: 0, Size: 8);
12113 SDVTList VTs = DAG.getVTList(VT1: MVT::v2i32, VT2: MVT::Other);
12114 SDValue Op = getMemIntrinsicNode(Opcode: Opc, DL, VTList: VTs, Ops, MemVT: MVT::v2i32, MMO: OpMMO, DAG);
12115 SDValue Status = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: Op,
12116 N2: DAG.getConstant(Val: 1, DL, VT: MVT::i32));
12117 SDValue Data = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: Op,
12118 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i32));
12119 SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: IntVT, Operand: Data);
12120 SDValue Value = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: LoadVT, Operand: Trunc);
12121 return DAG.getMergeValues(Ops: {Value, Status, SDValue(Op.getNode(), 1)}, dl: DL);
12122 }
12123
12124 unsigned Opc = LoadVT.getScalarType() == MVT::i8
12125 ? AMDGPUISD::BUFFER_LOAD_UBYTE
12126 : AMDGPUISD::BUFFER_LOAD_USHORT;
12127
12128 SDVTList ResList = DAG.getVTList(VT1: MVT::i32, VT2: MVT::Other);
12129 SDValue BufferLoad =
12130 DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: ResList, Ops, MemVT: IntVT, MMO);
12131 SDValue LoadVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: IntVT, Operand: BufferLoad);
12132 LoadVal = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: LoadVT, Operand: LoadVal);
12133
12134 return DAG.getMergeValues(Ops: {LoadVal, BufferLoad.getValue(R: 1)}, dl: DL);
12135}
12136
12137// Handle 8-bit and 16-bit buffer stores.
12138SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
12139 EVT VDataType, SDLoc DL,
12140 SDValue Ops[],
12141 MemSDNode *M) const {
12142 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
12143 Ops[1] = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i16, Operand: Ops[1]);
12144
12145 SDValue BufferStoreExt = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::i32, Operand: Ops[1]);
12146 Ops[1] = BufferStoreExt;
12147 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
12148 : AMDGPUISD::BUFFER_STORE_SHORT;
12149 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
12150 return DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: M->getVTList(), Ops: OpsRef, MemVT: VDataType,
12151 MMO: M->getMemOperand());
12152}
12153
12154static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType,
12155 SDValue Op, const SDLoc &SL, EVT VT) {
12156 if (VT.bitsLT(VT: Op.getValueType()))
12157 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: Op);
12158
12159 switch (ExtType) {
12160 case ISD::SEXTLOAD:
12161 return DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL: SL, VT, Operand: Op);
12162 case ISD::ZEXTLOAD:
12163 return DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT, Operand: Op);
12164 case ISD::EXTLOAD:
12165 return DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT, Operand: Op);
12166 case ISD::NON_EXTLOAD:
12167 return Op;
12168 }
12169
12170 llvm_unreachable("invalid ext type");
12171}
12172
12173// Try to turn 8-bit and 16-bit scalar loads into SMEM-eligible 32-bit loads.
12174// TODO: Skip this on GFX12, which does have scalar sub-dword loads.
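// For example, a 4-byte-aligned uniform zextload i8 from the constant address
// space is widened here to an SMEM-eligible 32-bit load, and the low byte is
// recovered with a zero-extend-in-reg (an AND with 0xff) instead of keeping an
// unhandled sub-dword scalar load.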
12175SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
12176 DAGCombinerInfo &DCI) const {
12177 SelectionDAG &DAG = DCI.DAG;
12178 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
12179 return SDValue();
12180
12181 // FIXME: Constant loads should all be marked invariant.
12182 unsigned AS = Ld->getAddressSpace();
12183 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
12184 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
12185 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
12186 return SDValue();
12187
12188 // Don't do this early for simple types, since it may interfere with adjacent
12189 // load merging for illegal types. For exotic (non-simple) types, doing it
12190 // pre-legalize avoids losing their alignment information.
12191 EVT MemVT = Ld->getMemoryVT();
12192 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
12193 MemVT.getSizeInBits() >= 32)
12194 return SDValue();
12195
12196 SDLoc SL(Ld);
12197
12198 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
12199 "unexpected vector extload");
12200
12201 // TODO: Drop only high part of range.
12202 SDValue Ptr = Ld->getBasePtr();
12203 SDValue NewLoad = DAG.getLoad(
12204 AM: ISD::UNINDEXED, ExtType: ISD::NON_EXTLOAD, VT: MVT::i32, dl: SL, Chain: Ld->getChain(), Ptr,
12205 Offset: Ld->getOffset(), PtrInfo: Ld->getPointerInfo(), MemVT: MVT::i32, Alignment: Ld->getAlign(),
12206 MMOFlags: Ld->getMemOperand()->getFlags(), AAInfo: Ld->getAAInfo(),
12207 Ranges: nullptr); // Drop ranges
12208
12209 EVT TruncVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: MemVT.getSizeInBits());
12210 if (MemVT.isFloatingPoint()) {
12211 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
12212 "unexpected fp extload");
12213 TruncVT = MemVT.changeTypeToInteger();
12214 }
12215
12216 SDValue Cvt = NewLoad;
12217 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
12218 Cvt = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL: SL, VT: MVT::i32, N1: NewLoad,
12219 N2: DAG.getValueType(TruncVT));
12220 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
12221 Ld->getExtensionType() == ISD::NON_EXTLOAD) {
12222 Cvt = DAG.getZeroExtendInReg(Op: NewLoad, DL: SL, VT: TruncVT);
12223 } else {
12224 assert(Ld->getExtensionType() == ISD::EXTLOAD);
12225 }
12226
12227 EVT VT = Ld->getValueType(ResNo: 0);
12228 EVT IntVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: VT.getSizeInBits());
12229
12230 DCI.AddToWorklist(N: Cvt.getNode());
12231
12232 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
12233 // the appropriate extension from the 32-bit load.
12234 Cvt = getLoadExtOrTrunc(DAG, ExtType: Ld->getExtensionType(), Op: Cvt, SL, VT: IntVT);
12235 DCI.AddToWorklist(N: Cvt.getNode());
12236
12237 // Handle conversion back to floating point if necessary.
12238 Cvt = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Cvt);
12239
12240 return DAG.getMergeValues(Ops: {Cvt, NewLoad.getValue(R: 1)}, dl: SL);
12241}
12242
12243static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
12244 const SIMachineFunctionInfo &Info) {
12245 // TODO: Should check if the address can definitely not access stack.
12246 if (Info.isEntryFunction())
12247 return Info.getUserSGPRInfo().hasFlatScratchInit();
12248 return true;
12249}
12250
12251SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
12252 SDLoc DL(Op);
12253 LoadSDNode *Load = cast<LoadSDNode>(Val&: Op);
12254 ISD::LoadExtType ExtType = Load->getExtensionType();
12255 EVT MemVT = Load->getMemoryVT();
12256 MachineMemOperand *MMO = Load->getMemOperand();
12257
12258 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
12259 if (MemVT == MVT::i16 && isTypeLegal(VT: MVT::i16))
12260 return SDValue();
12261
12262 // FIXME: Copied from PPC
12263 // First, load into 32 bits, then truncate down to the memory type.
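 // For example, for a <4 x i1> load the code below emits a single extending
 // load and then recovers element I as (trunc (srl NewLD, I)).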
12264
12265 SDValue Chain = Load->getChain();
12266 SDValue BasePtr = Load->getBasePtr();
12267
12268 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
12269
12270 SDValue NewLD = DAG.getExtLoad(ExtType: ISD::EXTLOAD, dl: DL, VT: MVT::i32, Chain, Ptr: BasePtr,
12271 MemVT: RealMemVT, MMO);
12272
12273 if (!MemVT.isVector()) {
12274 SDValue Ops[] = {DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MemVT, Operand: NewLD),
12275 NewLD.getValue(R: 1)};
12276
12277 return DAG.getMergeValues(Ops, dl: DL);
12278 }
12279
12280 SmallVector<SDValue, 3> Elts;
12281 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
12282 SDValue Elt = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: NewLD,
12283 N2: DAG.getConstant(Val: I, DL, VT: MVT::i32));
12284
12285 Elts.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i1, Operand: Elt));
12286 }
12287
12288 SDValue Ops[] = {DAG.getBuildVector(VT: MemVT, DL, Ops: Elts), NewLD.getValue(R: 1)};
12289
12290 return DAG.getMergeValues(Ops, dl: DL);
12291 }
12292
12293 if (!MemVT.isVector())
12294 return SDValue();
12295
12296 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
12297 "Custom lowering for non-i32 vectors hasn't been implemented.");
12298
12299 Align Alignment = Load->getAlign();
12300 unsigned AS = Load->getAddressSpace();
12301 if (Subtarget->hasLDSMisalignedBugInWGPMode() &&
12302 AS == AMDGPUAS::FLAT_ADDRESS &&
12303 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
12304 return SplitVectorLoad(Op, DAG);
12305 }
12306
12307 MachineFunction &MF = DAG.getMachineFunction();
12308 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
12309 // If there is a possibility that a flat instruction may access scratch memory,
12310 // then we need to use the same legalization rules we use for private memory.
12311 if (AS == AMDGPUAS::FLAT_ADDRESS &&
12312 !Subtarget->hasMultiDwordFlatScratchAddressing())
12313 AS = addressMayBeAccessedAsPrivate(MMO: Load->getMemOperand(), Info: *MFI)
12314 ? AMDGPUAS::PRIVATE_ADDRESS
12315 : AMDGPUAS::GLOBAL_ADDRESS;
12316
12317 unsigned NumElements = MemVT.getVectorNumElements();
12318
12319 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
12320 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
12321 (AS == AMDGPUAS::GLOBAL_ADDRESS &&
12322 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
12323 (Load->isInvariant() || isMemOpHasNoClobberedMemOperand(N: Load)))) {
12324 if ((!Op->isDivergent() || AMDGPU::isUniformMMO(MMO)) &&
12325 Alignment >= Align(4) && NumElements < 32) {
12326 if (MemVT.isPow2VectorType() ||
12327 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
12328 return SDValue();
12329 return WidenOrSplitVectorLoad(Op, DAG);
12330 }
12331 // Non-uniform loads will be selected to MUBUF instructions, so they
12332 // have the same legalization requirements as global and private
12333 // loads.
12334 //
12335 }
12336 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
12337 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
12338 AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
12339 if (NumElements > 4)
12340 return SplitVectorLoad(Op, DAG);
12341 // v3 loads not supported on SI.
12342 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12343 return WidenOrSplitVectorLoad(Op, DAG);
12344
12345 // v3 and v4 loads are supported for private and global memory.
12346 return SDValue();
12347 }
12348 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
12349 // Depending on the setting of the private_element_size field in the
12350 // resource descriptor, we can only make private accesses up to a certain
12351 // size.
12352 switch (Subtarget->getMaxPrivateElementSize()) {
12353 case 4: {
12354 auto [Op0, Op1] = scalarizeVectorLoad(LD: Load, DAG);
12355 return DAG.getMergeValues(Ops: {Op0, Op1}, dl: DL);
12356 }
12357 case 8:
12358 if (NumElements > 2)
12359 return SplitVectorLoad(Op, DAG);
12360 return SDValue();
12361 case 16:
12362 // Same as global/flat
12363 if (NumElements > 4)
12364 return SplitVectorLoad(Op, DAG);
12365 // v3 loads not supported on SI.
12366 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12367 return WidenOrSplitVectorLoad(Op, DAG);
12368
12369 return SDValue();
12370 default:
12371 llvm_unreachable("unsupported private_element_size");
12372 }
12373 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
12374 unsigned Fast = 0;
12375 auto Flags = Load->getMemOperand()->getFlags();
12376 if (allowsMisalignedMemoryAccessesImpl(Size: MemVT.getSizeInBits(), AddrSpace: AS,
12377 Alignment: Load->getAlign(), Flags, IsFast: &Fast) &&
12378 Fast > 1)
12379 return SDValue();
12380
12381 if (MemVT.isVector())
12382 return SplitVectorLoad(Op, DAG);
12383 }
12384
12385 if (!allowsMemoryAccessForAlignment(Context&: *DAG.getContext(), DL: DAG.getDataLayout(),
12386 VT: MemVT, MMO: *Load->getMemOperand())) {
12387 auto [Op0, Op1] = expandUnalignedLoad(LD: Load, DAG);
12388 return DAG.getMergeValues(Ops: {Op0, Op1}, dl: DL);
12389 }
12390
12391 return SDValue();
12392}
12393
12394SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
12395 EVT VT = Op.getValueType();
12396 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
12397 VT.getSizeInBits() == 512)
12398 return splitTernaryVectorOp(Op, DAG);
12399
12400 assert(VT.getSizeInBits() == 64);
12401
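 // Split the 64-bit select into two 32-bit selects on the low and high halves,
 // so each half maps to a 32-bit conditional move. The condition is frozen
 // because it now has two uses and both must observe the same value even if it
 // is undef or poison.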
12402 SDLoc DL(Op);
12403 SDValue Cond = DAG.getFreeze(V: Op.getOperand(i: 0));
12404
12405 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
12406 SDValue One = DAG.getConstant(Val: 1, DL, VT: MVT::i32);
12407
12408 SDValue LHS = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v2i32, Operand: Op.getOperand(i: 1));
12409 SDValue RHS = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v2i32, Operand: Op.getOperand(i: 2));
12410
12411 SDValue Lo0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: LHS, N2: Zero);
12412 SDValue Lo1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: RHS, N2: Zero);
12413
12414 SDValue Lo = DAG.getSelect(DL, VT: MVT::i32, Cond, LHS: Lo0, RHS: Lo1);
12415
12416 SDValue Hi0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: LHS, N2: One);
12417 SDValue Hi1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: RHS, N2: One);
12418
12419 SDValue Hi = DAG.getSelect(DL, VT: MVT::i32, Cond, LHS: Hi0, RHS: Hi1);
12420
12421 SDValue Res = DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Lo, Hi});
12422 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Res);
12423}
12424
12425// Catch division cases where we can use shortcuts with rcp and rsq
12426// instructions.
12427SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
12428 SelectionDAG &DAG) const {
12429 SDLoc SL(Op);
12430 SDValue LHS = Op.getOperand(i: 0);
12431 SDValue RHS = Op.getOperand(i: 1);
12432 EVT VT = Op.getValueType();
12433 const SDNodeFlags Flags = Op->getFlags();
12434
12435 bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
12436
12437 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(Val&: LHS)) {
12438 // Without !fpmath accuracy information, we can't do more because we don't
12439 // know exactly whether rcp is accurate enough to meet the !fpmath requirement.
12440 // f16 (and bf16) is always accurate enough.
12441 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
12442 return SDValue();
12443
12444 if (CLHS->isExactlyValue(V: 1.0)) {
12445 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
12446 // the CI documentation, have a worst-case error of 1 ulp.
12447 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
12448 // use it as long as we aren't trying to use denormals.
12449 //
12450 // v_rcp_f16 and v_rsq_f16 DO support denormals and have a 0.51 ulp error.
12451
12452 // 1.0 / sqrt(x) -> rsq(x)
12453
12454 // XXX - Is afn sufficient to do this for f64? The maximum ULP
12455 // error seems really high at 2^29 ULP.
12456 // 1.0 / x -> rcp(x)
12457 return DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: RHS);
12458 }
12459
12460 // Same as for 1.0, but expand the sign out of the constant.
12461 if (CLHS->isExactlyValue(V: -1.0)) {
12462 // -1.0 / x -> rcp (fneg x)
12463 SDValue FNegRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
12464 return DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: FNegRHS);
12465 }
12466 }
12467
12468 // For f16 and bf16 require afn or arcp.
12469 // For f32 require afn.
12470 if (!AllowInaccurateRcp &&
12471 ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
12472 return SDValue();
12473
12474 // Turn into multiply by the reciprocal.
12475 // x / y -> x * (1.0 / y)
12476 SDValue Recip = DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: RHS);
12477 return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: LHS, N2: Recip, Flags);
12478}
12479
12480SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
12481 SelectionDAG &DAG) const {
12482 SDLoc SL(Op);
12483 SDValue X = Op.getOperand(i: 0);
12484 SDValue Y = Op.getOperand(i: 1);
12485 EVT VT = Op.getValueType();
12486 const SDNodeFlags Flags = Op->getFlags();
12487
12488 bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
12489 if (!AllowInaccurateDiv)
12490 return SDValue();
12491
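 // This refines the hardware reciprocal estimate with two Newton-Raphson steps
 // and a final quotient correction (a sketch of the FMA sequence below):
 //   r1 = r0 + r0 * (1 - y * r0)
 //   r2 = r1 + r1 * (1 - y * r1)
 //   q  = x * r2;  result = q + r2 * (x - y * q)
 // This is only used under afn; the result is not guaranteed to be correctly
 // rounded.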
12492 SDValue NegY = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Y);
12493 SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT);
12494
12495 SDValue R = DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: Y);
12496 SDValue Tmp0 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: NegY, N2: R, N3: One);
12497
12498 R = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: Tmp0, N2: R, N3: R);
12499 SDValue Tmp1 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: NegY, N2: R, N3: One);
12500 R = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: Tmp1, N2: R, N3: R);
12501 SDValue Ret = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: R);
12502 SDValue Tmp2 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: NegY, N2: Ret, N3: X);
12503 return DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: Tmp2, N2: R, N3: Ret);
12504}
12505
12506static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12507 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
12508 SDNodeFlags Flags) {
12509 if (GlueChain->getNumValues() <= 1) {
12510 return DAG.getNode(Opcode, DL: SL, VT, N1: A, N2: B, Flags);
12511 }
12512
12513 assert(GlueChain->getNumValues() == 3);
12514
12515 SDVTList VTList = DAG.getVTList(VT1: VT, VT2: MVT::Other, VT3: MVT::Glue);
12516 switch (Opcode) {
12517 default:
12518 llvm_unreachable("no chain equivalent for opcode");
12519 case ISD::FMUL:
12520 Opcode = AMDGPUISD::FMUL_W_CHAIN;
12521 break;
12522 }
12523
12524 return DAG.getNode(Opcode, DL: SL, VTList,
12525 Ops: {GlueChain.getValue(R: 1), A, B, GlueChain.getValue(R: 2)},
12526 Flags);
12527}
12528
12529static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12530 EVT VT, SDValue A, SDValue B, SDValue C,
12531 SDValue GlueChain, SDNodeFlags Flags) {
12532 if (GlueChain->getNumValues() <= 1) {
12533 return DAG.getNode(Opcode, DL: SL, VT, Ops: {A, B, C}, Flags);
12534 }
12535
12536 assert(GlueChain->getNumValues() == 3);
12537
12538 SDVTList VTList = DAG.getVTList(VT1: VT, VT2: MVT::Other, VT3: MVT::Glue);
12539 switch (Opcode) {
12540 default:
12541 llvm_unreachable("no chain equivalent for opcode");
12542 case ISD::FMA:
12543 Opcode = AMDGPUISD::FMA_W_CHAIN;
12544 break;
12545 }
12546
12547 return DAG.getNode(Opcode, DL: SL, VTList,
12548 Ops: {GlueChain.getValue(R: 1), A, B, C, GlueChain.getValue(R: 2)},
12549 Flags);
12550}
12551
12552SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
12553 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12554 return FastLowered;
12555
12556 SDLoc SL(Op);
12557 EVT VT = Op.getValueType();
12558 SDValue LHS = Op.getOperand(i: 0);
12559 SDValue RHS = Op.getOperand(i: 1);
12560
12561 SDValue LHSExt = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: LHS);
12562 SDValue RHSExt = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: RHS);
12563
12564 if (VT == MVT::bf16) {
12565 SDValue ExtDiv =
12566 DAG.getNode(Opcode: ISD::FDIV, DL: SL, VT: MVT::f32, N1: LHSExt, N2: RHSExt, Flags: Op->getFlags());
12567 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT: MVT::bf16, N1: ExtDiv,
12568 N2: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32));
12569 }
12570
12571 assert(VT == MVT::f16);
12572
12573 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
12574 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
12575 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
12576 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
12577 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
12578 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
12579 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
12580 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
12581 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
12582 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
12583 // q16.u = opx(V_CVT_F16_F32, q32.u);
12584 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
12585
12586 // We will use ISD::FMA on targets that don't support ISD::FMAD.
12587 unsigned FMADOpCode =
12588 isOperationLegal(Op: ISD::FMAD, VT: MVT::f32) ? ISD::FMAD : ISD::FMA;
12589 SDValue NegRHSExt = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f32, Operand: RHSExt);
12590 SDValue Rcp =
12591 DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT: MVT::f32, Operand: RHSExt, Flags: Op->getFlags());
12592 SDValue Quot =
12593 DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f32, N1: LHSExt, N2: Rcp, Flags: Op->getFlags());
12594 SDValue Err = DAG.getNode(Opcode: FMADOpCode, DL: SL, VT: MVT::f32, N1: NegRHSExt, N2: Quot, N3: LHSExt,
12595 Flags: Op->getFlags());
12596 Quot = DAG.getNode(Opcode: FMADOpCode, DL: SL, VT: MVT::f32, N1: Err, N2: Rcp, N3: Quot, Flags: Op->getFlags());
12597 Err = DAG.getNode(Opcode: FMADOpCode, DL: SL, VT: MVT::f32, N1: NegRHSExt, N2: Quot, N3: LHSExt,
12598 Flags: Op->getFlags());
12599 SDValue Tmp = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f32, N1: Err, N2: Rcp, Flags: Op->getFlags());
12600 SDValue TmpCast = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: Tmp);
12601 TmpCast = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32, N1: TmpCast,
12602 N2: DAG.getConstant(Val: 0xff800000, DL: SL, VT: MVT::i32));
12603 Tmp = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f32, Operand: TmpCast);
12604 Quot = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT: MVT::f32, N1: Tmp, N2: Quot, Flags: Op->getFlags());
12605 SDValue RDst = DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT: MVT::f16, N1: Quot,
12606 N2: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32));
12607 return DAG.getNode(Opcode: AMDGPUISD::DIV_FIXUP, DL: SL, VT: MVT::f16, N1: RDst, N2: RHS, N3: LHS,
12608 Flags: Op->getFlags());
12609}
12610
12611// Faster 2.5 ULP division that does not support denormals.
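// Roughly: if |rhs| > 2^96, 1/rhs would land in (or near) the denormal range
// and rcp would flush it, so both rhs and the final product are scaled by
// 2^-32, which cancels out:
//   s = (|rhs| > 2^96) ? 2^-32 : 1.0
//   result = s * (lhs * rcp(rhs * s))  ~=  lhs / rhs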
12612SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
12613 SDNodeFlags Flags = Op->getFlags();
12614 SDLoc SL(Op);
12615 SDValue LHS = Op.getOperand(i: 1);
12616 SDValue RHS = Op.getOperand(i: 2);
12617
12618 // TODO: The combiner should probably handle elimination of redundant fabs.
12619 SDValue r1 = DAG.SignBitIsZeroFP(Op: RHS)
12620 ? RHS
12621 : DAG.getNode(Opcode: ISD::FABS, DL: SL, VT: MVT::f32, Operand: RHS, Flags);
12622
12623 const APFloat K0Val(0x1p+96f);
12624 const SDValue K0 = DAG.getConstantFP(Val: K0Val, DL: SL, VT: MVT::f32);
12625
12626 const APFloat K1Val(0x1p-32f);
12627 const SDValue K1 = DAG.getConstantFP(Val: K1Val, DL: SL, VT: MVT::f32);
12628
12629 const SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT: MVT::f32);
12630
12631 EVT SetCCVT =
12632 getSetCCResultType(DL: DAG.getDataLayout(), Ctx&: *DAG.getContext(), VT: MVT::f32);
12633
12634 SDValue r2 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: r1, RHS: K0, Cond: ISD::SETOGT);
12635
12636 SDValue r3 = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::f32, N1: r2, N2: K1, N3: One, Flags);
12637
12638 r1 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f32, N1: RHS, N2: r3, Flags);
12639
12640 // rcp does not support denormals.
12641 SDValue r0 = DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT: MVT::f32, Operand: r1, Flags);
12642
12643 SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f32, N1: LHS, N2: r0, Flags);
12644
12645 return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f32, N1: r3, N2: Mul, Flags);
12646}
12647
12648// Returns the immediate value for setting the F32 denorm mode when using the
12649// S_DENORM_MODE instruction.
12650static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
12651 const SIMachineFunctionInfo *Info,
12652 const GCNSubtarget *ST) {
12653 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
12654 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
12655 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
12656 return DAG.getTargetConstant(Val: Mode, DL: SDLoc(), VT: MVT::i32);
12657}
12658
12659SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
12660 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12661 return FastLowered;
12662
12663 // The selection matcher assumes anything with a chain selects to a
12664 // mayRaiseFPException machine instruction. Since we're introducing a chain
12665 // here, we need to explicitly report nofpexcept for the regular fdiv
12666 // lowering.
12667 SDNodeFlags Flags = Op->getFlags();
12668 Flags.setNoFPExcept(true);
12669
12670 SDLoc SL(Op);
12671 SDValue LHS = Op.getOperand(i: 0);
12672 SDValue RHS = Op.getOperand(i: 1);
12673
12674 const SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT: MVT::f32);
12675
12676 SDVTList ScaleVT = DAG.getVTList(VT1: MVT::f32, VT2: MVT::i1);
12677
12678 SDValue DenominatorScaled =
12679 DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL: SL, VTList: ScaleVT, Ops: {RHS, RHS, LHS}, Flags);
12680 SDValue NumeratorScaled =
12681 DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL: SL, VTList: ScaleVT, Ops: {LHS, RHS, LHS}, Flags);
12682
12683 // Denominator is scaled to not be denormal, so using rcp is ok.
12684 SDValue ApproxRcp =
12685 DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT: MVT::f32, Operand: DenominatorScaled, Flags);
12686 SDValue NegDivScale0 =
12687 DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f32, Operand: DenominatorScaled, Flags);
12688
12689 using namespace AMDGPU::Hwreg;
12690 const unsigned Denorm32Reg = HwregEncoding::encode(Values: ID_MODE, Values: 4, Values: 2);
12691 const SDValue BitField = DAG.getTargetConstant(Val: Denorm32Reg, DL: SL, VT: MVT::i32);
12692
12693 const MachineFunction &MF = DAG.getMachineFunction();
12694 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
12695 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
12696
12697 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
12698 const bool HasDynamicDenormals =
12699 (DenormMode.Input == DenormalMode::Dynamic) ||
12700 (DenormMode.Output == DenormalMode::Dynamic);
12701
12702 SDValue SavedDenormMode;
12703
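 // If the current mode flushes FP32 denormals, temporarily enable them around
 // the FMA refinement sequence below; the intermediate values of the scaled
 // iteration may be denormal, and flushing them would cost accuracy. For a
 // dynamic mode, the current value is first saved with S_GETREG so it can be
 // restored afterwards.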
12704 if (!PreservesDenormals) {
12705 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
12706 // lowering. The chain dependence is insufficient, and we need glue. We do
12707 // not need the glue variants in a strictfp function.
12708
12709 SDVTList BindParamVTs = DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue);
12710
12711 SDValue Glue = DAG.getEntryNode();
12712 if (HasDynamicDenormals) {
12713 SDNode *GetReg = DAG.getMachineNode(Opcode: AMDGPU::S_GETREG_B32, dl: SL,
12714 VTs: DAG.getVTList(VT1: MVT::i32, VT2: MVT::Glue),
12715 Ops: {BitField, Glue});
12716 SavedDenormMode = SDValue(GetReg, 0);
12717
12718 Glue = DAG.getMergeValues(
12719 Ops: {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, dl: SL);
12720 }
12721
12722 SDNode *EnableDenorm;
12723 if (Subtarget->hasDenormModeInst()) {
12724 const SDValue EnableDenormValue =
12725 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, ST: Subtarget);
12726
12727 EnableDenorm = DAG.getNode(Opcode: AMDGPUISD::DENORM_MODE, DL: SL, VTList: BindParamVTs, N1: Glue,
12728 N2: EnableDenormValue)
12729 .getNode();
12730 } else {
12731 const SDValue EnableDenormValue =
12732 DAG.getConstant(FP_DENORM_FLUSH_NONE, DL: SL, VT: MVT::i32);
12733 EnableDenorm = DAG.getMachineNode(Opcode: AMDGPU::S_SETREG_B32, dl: SL, VTs: BindParamVTs,
12734 Ops: {EnableDenormValue, BitField, Glue});
12735 }
12736
12737 SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
12738 SDValue(EnableDenorm, 1)};
12739
12740 NegDivScale0 = DAG.getMergeValues(Ops, dl: SL);
12741 }
12742
12743 SDValue Fma0 = getFPTernOp(DAG, Opcode: ISD::FMA, SL, VT: MVT::f32, A: NegDivScale0,
12744 B: ApproxRcp, C: One, GlueChain: NegDivScale0, Flags);
12745
12746 SDValue Fma1 = getFPTernOp(DAG, Opcode: ISD::FMA, SL, VT: MVT::f32, A: Fma0, B: ApproxRcp,
12747 C: ApproxRcp, GlueChain: Fma0, Flags);
12748
12749 SDValue Mul = getFPBinOp(DAG, Opcode: ISD::FMUL, SL, VT: MVT::f32, A: NumeratorScaled, B: Fma1,
12750 GlueChain: Fma1, Flags);
12751
12752 SDValue Fma2 = getFPTernOp(DAG, Opcode: ISD::FMA, SL, VT: MVT::f32, A: NegDivScale0, B: Mul,
12753 C: NumeratorScaled, GlueChain: Mul, Flags);
12754
12755 SDValue Fma3 =
12756 getFPTernOp(DAG, Opcode: ISD::FMA, SL, VT: MVT::f32, A: Fma2, B: Fma1, C: Mul, GlueChain: Fma2, Flags);
12757
12758 SDValue Fma4 = getFPTernOp(DAG, Opcode: ISD::FMA, SL, VT: MVT::f32, A: NegDivScale0, B: Fma3,
12759 C: NumeratorScaled, GlueChain: Fma3, Flags);
12760
12761 if (!PreservesDenormals) {
12762 SDNode *DisableDenorm;
12763 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
12764 const SDValue DisableDenormValue = getSPDenormModeValue(
12765 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, ST: Subtarget);
12766
12767 SDVTList BindParamVTs = DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue);
12768 DisableDenorm =
12769 DAG.getNode(Opcode: AMDGPUISD::DENORM_MODE, DL: SL, VTList: BindParamVTs,
12770 N1: Fma4.getValue(R: 1), N2: DisableDenormValue, N3: Fma4.getValue(R: 2))
12771 .getNode();
12772 } else {
12773 assert(HasDynamicDenormals == (bool)SavedDenormMode);
12774 const SDValue DisableDenormValue =
12775 HasDynamicDenormals
12776 ? SavedDenormMode
12777 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, DL: SL, VT: MVT::i32);
12778
12779 DisableDenorm = DAG.getMachineNode(
12780 Opcode: AMDGPU::S_SETREG_B32, dl: SL, VT: MVT::Other,
12781 Ops: {DisableDenormValue, BitField, Fma4.getValue(R: 1), Fma4.getValue(R: 2)});
12782 }
12783
12784 SDValue OutputChain = DAG.getNode(Opcode: ISD::TokenFactor, DL: SL, VT: MVT::Other,
12785 N1: SDValue(DisableDenorm, 0), N2: DAG.getRoot());
12786 DAG.setRoot(OutputChain);
12787 }
12788
12789 SDValue Scale = NumeratorScaled.getValue(R: 1);
12790 SDValue Fmas = DAG.getNode(Opcode: AMDGPUISD::DIV_FMAS, DL: SL, VT: MVT::f32,
12791 Ops: {Fma4, Fma1, Fma3, Scale}, Flags);
12792
12793 return DAG.getNode(Opcode: AMDGPUISD::DIV_FIXUP, DL: SL, VT: MVT::f32, N1: Fmas, N2: RHS, N3: LHS, Flags);
12794}
12795
12796SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
12797 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
12798 return FastLowered;
12799
12800 SDLoc SL(Op);
12801 SDValue X = Op.getOperand(i: 0);
12802 SDValue Y = Op.getOperand(i: 1);
12803
12804 const SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT: MVT::f64);
12805
12806 SDVTList ScaleVT = DAG.getVTList(VT1: MVT::f64, VT2: MVT::i1);
12807
12808 SDValue DivScale0 = DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL: SL, VTList: ScaleVT, N1: Y, N2: Y, N3: X);
12809
12810 SDValue NegDivScale0 = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f64, Operand: DivScale0);
12811
12812 SDValue Rcp = DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT: MVT::f64, Operand: DivScale0);
12813
12814 SDValue Fma0 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: MVT::f64, N1: NegDivScale0, N2: Rcp, N3: One);
12815
12816 SDValue Fma1 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: MVT::f64, N1: Rcp, N2: Fma0, N3: Rcp);
12817
12818 SDValue Fma2 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: MVT::f64, N1: NegDivScale0, N2: Fma1, N3: One);
12819
12820 SDValue DivScale1 = DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL: SL, VTList: ScaleVT, N1: X, N2: Y, N3: X);
12821
12822 SDValue Fma3 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: MVT::f64, N1: Fma1, N2: Fma2, N3: Fma1);
12823 SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f64, N1: DivScale1, N2: Fma3);
12824
12825 SDValue Fma4 =
12826 DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: MVT::f64, N1: NegDivScale0, N2: Mul, N3: DivScale1);
12827
12828 SDValue Scale;
12829
12830 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
12831 // Work around a hardware bug on SI where the condition output from div_scale
12832 // is not usable.
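 // Roughly: compare the high 32 bits of each input against the corresponding
 // div_scale result to detect which operand was scaled, and xor the two tests
 // to reconstruct the condition bit that div_fmas expects in place of the
 // broken div_scale output.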
12833
12834 const SDValue Hi = DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32);
12835
12836 // Figure out which scale to use for div_fmas.
12837 SDValue NumBC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: X);
12838 SDValue DenBC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Y);
12839 SDValue Scale0BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: DivScale0);
12840 SDValue Scale1BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: DivScale1);
12841
12842 SDValue NumHi =
12843 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: NumBC, N2: Hi);
12844 SDValue DenHi =
12845 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: DenBC, N2: Hi);
12846
12847 SDValue Scale0Hi =
12848 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Scale0BC, N2: Hi);
12849 SDValue Scale1Hi =
12850 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Scale1BC, N2: Hi);
12851
12852 SDValue CmpDen = DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: DenHi, RHS: Scale0Hi, Cond: ISD::SETEQ);
12853 SDValue CmpNum = DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: NumHi, RHS: Scale1Hi, Cond: ISD::SETEQ);
12854 Scale = DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i1, N1: CmpNum, N2: CmpDen);
12855 } else {
12856 Scale = DivScale1.getValue(R: 1);
12857 }
12858
12859 SDValue Fmas =
12860 DAG.getNode(Opcode: AMDGPUISD::DIV_FMAS, DL: SL, VT: MVT::f64, N1: Fma4, N2: Fma3, N3: Mul, N4: Scale);
12861
12862 return DAG.getNode(Opcode: AMDGPUISD::DIV_FIXUP, DL: SL, VT: MVT::f64, N1: Fmas, N2: Y, N3: X);
12863}
12864
12865SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
12866 EVT VT = Op.getValueType();
12867
12868 if (VT == MVT::f32)
12869 return LowerFDIV32(Op, DAG);
12870
12871 if (VT == MVT::f64)
12872 return LowerFDIV64(Op, DAG);
12873
12874 if (VT == MVT::f16 || VT == MVT::bf16)
12875 return LowerFDIV16(Op, DAG);
12876
12877 llvm_unreachable("Unexpected type for fdiv");
12878}
12879
12880SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
12881 SDLoc dl(Op);
12882 SDValue Val = Op.getOperand(i: 0);
12883 EVT VT = Val.getValueType();
12884 EVT ResultExpVT = Op->getValueType(ResNo: 1);
12885 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
12886
12887 SDValue Mant = DAG.getNode(
12888 Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT,
12889 N1: DAG.getTargetConstant(Val: Intrinsic::amdgcn_frexp_mant, DL: dl, VT: MVT::i32), N2: Val);
12890
12891 SDValue Exp = DAG.getNode(
12892 Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT: InstrExpVT,
12893 N1: DAG.getTargetConstant(Val: Intrinsic::amdgcn_frexp_exp, DL: dl, VT: MVT::i32), N2: Val);
12894
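 // On subtargets with the fract bug, the frexp instructions do not produce the
 // expected results for +/-inf and nan, so non-finite inputs are special-cased
 // below: the mantissa falls back to the source value and the exponent to 0.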
12895 if (Subtarget->hasFractBug()) {
12896 SDValue Fabs = DAG.getNode(Opcode: ISD::FABS, DL: dl, VT, Operand: Val);
12897 SDValue Inf =
12898 DAG.getConstantFP(Val: APFloat::getInf(Sem: VT.getFltSemantics()), DL: dl, VT);
12899
12900 SDValue IsFinite = DAG.getSetCC(DL: dl, VT: MVT::i1, LHS: Fabs, RHS: Inf, Cond: ISD::SETOLT);
12901 SDValue Zero = DAG.getConstant(Val: 0, DL: dl, VT: InstrExpVT);
12902 Exp = DAG.getNode(Opcode: ISD::SELECT, DL: dl, VT: InstrExpVT, N1: IsFinite, N2: Exp, N3: Zero);
12903 Mant = DAG.getNode(Opcode: ISD::SELECT, DL: dl, VT, N1: IsFinite, N2: Mant, N3: Val);
12904 }
12905
12906 SDValue CastExp = DAG.getSExtOrTrunc(Op: Exp, DL: dl, VT: ResultExpVT);
12907 return DAG.getMergeValues(Ops: {Mant, CastExp}, dl);
12908}
12909
12910SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
12911 SDLoc DL(Op);
12912 StoreSDNode *Store = cast<StoreSDNode>(Val&: Op);
12913 EVT VT = Store->getMemoryVT();
12914
12915 if (VT == MVT::i1) {
12916 return DAG.getTruncStore(
12917 Chain: Store->getChain(), dl: DL,
12918 Val: DAG.getSExtOrTrunc(Op: Store->getValue(), DL, VT: MVT::i32),
12919 Ptr: Store->getBasePtr(), SVT: MVT::i1, MMO: Store->getMemOperand());
12920 }
12921
12922 assert(VT.isVector() &&
12923 Store->getValue().getValueType().getScalarType() == MVT::i32);
12924
12925 unsigned AS = Store->getAddressSpace();
12926 if (Subtarget->hasLDSMisalignedBugInWGPMode() &&
12927 AS == AMDGPUAS::FLAT_ADDRESS &&
12928 Store->getAlign().value() < VT.getStoreSize() &&
12929 VT.getSizeInBits() > 32) {
12930 return SplitVectorStore(Op, DAG);
12931 }
12932
12933 MachineFunction &MF = DAG.getMachineFunction();
12934 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
12935 // If there is a possibility that a flat instruction may access scratch memory,
12936 // then we need to use the same legalization rules we use for private memory.
12937 if (AS == AMDGPUAS::FLAT_ADDRESS &&
12938 !Subtarget->hasMultiDwordFlatScratchAddressing())
12939 AS = addressMayBeAccessedAsPrivate(MMO: Store->getMemOperand(), Info: *MFI)
12940 ? AMDGPUAS::PRIVATE_ADDRESS
12941 : AMDGPUAS::GLOBAL_ADDRESS;
12942
12943 unsigned NumElements = VT.getVectorNumElements();
12944 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
12945 if (NumElements > 4)
12946 return SplitVectorStore(Op, DAG);
12947 // v3 stores not supported on SI.
12948 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12949 return SplitVectorStore(Op, DAG);
12950
12951 if (!allowsMemoryAccessForAlignment(Context&: *DAG.getContext(), DL: DAG.getDataLayout(),
12952 VT, MMO: *Store->getMemOperand()))
12953 return expandUnalignedStore(ST: Store, DAG);
12954
12955 return SDValue();
12956 }
12957 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
12958 switch (Subtarget->getMaxPrivateElementSize()) {
12959 case 4:
12960 return scalarizeVectorStore(ST: Store, DAG);
12961 case 8:
12962 if (NumElements > 2)
12963 return SplitVectorStore(Op, DAG);
12964 return SDValue();
12965 case 16:
12966 if (NumElements > 4 ||
12967 (NumElements == 3 && !Subtarget->hasFlatScratchEnabled()))
12968 return SplitVectorStore(Op, DAG);
12969 return SDValue();
12970 default:
12971 llvm_unreachable("unsupported private_element_size");
12972 }
12973 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
12974 unsigned Fast = 0;
12975 auto Flags = Store->getMemOperand()->getFlags();
12976 if (allowsMisalignedMemoryAccessesImpl(Size: VT.getSizeInBits(), AddrSpace: AS,
12977 Alignment: Store->getAlign(), Flags, IsFast: &Fast) &&
12978 Fast > 1)
12979 return SDValue();
12980
12981 if (VT.isVector())
12982 return SplitVectorStore(Op, DAG);
12983
12984 return expandUnalignedStore(ST: Store, DAG);
12985 }
12986
12987 // Probably an invalid store. If so we'll end up emitting a selection error.
12988 return SDValue();
12989}
12990
12991// Avoid the full, correct f32 sqrt expansion when promoting from f16.
12992SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
12993 SDLoc SL(Op);
12994 assert(!Subtarget->has16BitInsts());
12995 SDNodeFlags Flags = Op->getFlags();
12996 SDValue Ext =
12997 DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Op.getOperand(i: 0), Flags);
12998
12999 SDValue SqrtID = DAG.getTargetConstant(Val: Intrinsic::amdgcn_sqrt, DL: SL, VT: MVT::i32);
13000 SDValue Sqrt =
13001 DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::f32, N1: SqrtID, N2: Ext, Flags);
13002
13003 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT: MVT::f16, N1: Sqrt,
13004 N2: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32), Flags);
13005}
13006
13007SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
13008 SDLoc DL(Op);
13009 SDNodeFlags Flags = Op->getFlags();
13010 MVT VT = Op.getValueType().getSimpleVT();
13011 const SDValue X = Op.getOperand(i: 0);
13012
13013 if (allowApproxFunc(DAG, Flags)) {
13014 // The instruction is accurate to 1 ulp but ignores denormals.
13015 return DAG.getNode(
13016 Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT,
13017 N1: DAG.getTargetConstant(Val: Intrinsic::amdgcn_sqrt, DL, VT: MVT::i32), N2: X, Flags);
13018 }
13019
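 // Inputs below 2^-96 are scaled up by 2^32 before the square root so the core
 // sequence never operates on a denormal; since sqrt(x * 2^32) = sqrt(x) * 2^16,
 // the result is scaled back down by 2^-16 at the end.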
13020 SDValue ScaleThreshold = DAG.getConstantFP(Val: 0x1.0p-96f, DL, VT);
13021 SDValue NeedScale = DAG.getSetCC(DL, VT: MVT::i1, LHS: X, RHS: ScaleThreshold, Cond: ISD::SETOLT);
13022
13023 SDValue ScaleUpFactor = DAG.getConstantFP(Val: 0x1.0p+32f, DL, VT);
13024
13025 SDValue ScaledX = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: X, N2: ScaleUpFactor, Flags);
13026
13027 SDValue SqrtX =
13028 DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: NeedScale, N2: ScaledX, N3: X, Flags);
13029
13030 SDValue SqrtS;
13031 if (needsDenormHandlingF32(DAG, Src: X, Flags)) {
13032 SDValue SqrtID =
13033 DAG.getTargetConstant(Val: Intrinsic::amdgcn_sqrt, DL, VT: MVT::i32);
13034 SqrtS = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT, N1: SqrtID, N2: SqrtX, Flags);
13035
13036 SDValue SqrtSAsInt = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i32, Operand: SqrtS);
13037 SDValue SqrtSNextDownInt =
13038 DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, N1: SqrtSAsInt,
13039 N2: DAG.getAllOnesConstant(DL, VT: MVT::i32));
13040 SDValue SqrtSNextDown = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: SqrtSNextDownInt);
13041
13042 SDValue NegSqrtSNextDown =
13043 DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: SqrtSNextDown, Flags);
13044
13045 SDValue SqrtVP =
13046 DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: NegSqrtSNextDown, N2: SqrtS, N3: SqrtX, Flags);
13047
13048 SDValue SqrtSNextUpInt = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, N1: SqrtSAsInt,
13049 N2: DAG.getConstant(Val: 1, DL, VT: MVT::i32));
13050 SDValue SqrtSNextUp = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: SqrtSNextUpInt);
13051
13052 SDValue NegSqrtSNextUp = DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: SqrtSNextUp, Flags);
13053 SDValue SqrtVS =
13054 DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: NegSqrtSNextUp, N2: SqrtS, N3: SqrtX, Flags);
13055
13056 SDValue Zero = DAG.getConstantFP(Val: 0.0f, DL, VT);
13057 SDValue SqrtVPLE0 = DAG.getSetCC(DL, VT: MVT::i1, LHS: SqrtVP, RHS: Zero, Cond: ISD::SETOLE);
13058
13059 SqrtS = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: SqrtVPLE0, N2: SqrtSNextDown, N3: SqrtS,
13060 Flags);
13061
13062 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, VT: MVT::i1, LHS: SqrtVS, RHS: Zero, Cond: ISD::SETOGT);
13063 SqrtS = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: SqrtVPVSGT0, N2: SqrtSNextUp, N3: SqrtS,
13064 Flags);
13065 } else {
13066 SDValue SqrtR = DAG.getNode(Opcode: AMDGPUISD::RSQ, DL, VT, Operand: SqrtX, Flags);
13067
13068 SqrtS = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: SqrtX, N2: SqrtR, Flags);
13069
13070 SDValue Half = DAG.getConstantFP(Val: 0.5f, DL, VT);
13071 SDValue SqrtH = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: SqrtR, N2: Half, Flags);
13072 SDValue NegSqrtH = DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: SqrtH, Flags);
13073
13074 SDValue SqrtE = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: NegSqrtH, N2: SqrtS, N3: Half, Flags);
13075 SqrtH = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: SqrtH, N2: SqrtE, N3: SqrtH, Flags);
13076 SqrtS = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: SqrtS, N2: SqrtE, N3: SqrtS, Flags);
13077
13078 SDValue NegSqrtS = DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: SqrtS, Flags);
13079 SDValue SqrtD =
13080 DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: NegSqrtS, N2: SqrtS, N3: SqrtX, Flags);
13081 SqrtS = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: SqrtD, N2: SqrtH, N3: SqrtS, Flags);
13082 }
13083
13084 SDValue ScaleDownFactor = DAG.getConstantFP(Val: 0x1.0p-16f, DL, VT);
13085
13086 SDValue ScaledDown =
13087 DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: SqrtS, N2: ScaleDownFactor, Flags);
13088
13089 SqrtS = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: NeedScale, N2: ScaledDown, N3: SqrtS, Flags);
13090 SDValue IsZeroOrInf =
13091 DAG.getNode(Opcode: ISD::IS_FPCLASS, DL, VT: MVT::i1, N1: SqrtX,
13092 N2: DAG.getTargetConstant(Val: fcZero | fcPosInf, DL, VT: MVT::i32));
13093
13094 return DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: IsZeroOrInf, N2: SqrtX, N3: SqrtS, Flags);
13095}
13096
13097SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
13098// For the double type, the SQRT and RSQ instructions don't have the required
13099// precision, so we apply Goldschmidt's algorithm to improve the result:
13100 //
13101 // y0 = rsq(x)
13102 // g0 = x * y0
13103 // h0 = 0.5 * y0
13104 //
13105 // r0 = 0.5 - h0 * g0
13106 // g1 = g0 * r0 + g0
13107 // h1 = h0 * r0 + h0
13108 //
13109 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
13110 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
13111 // h2 = h1 * r1 + h1
13112 //
13113 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
13114 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
13115 //
13116 // sqrt(x) = g3
13117
13118 SDNodeFlags Flags = Op->getFlags();
13119
13120 SDLoc DL(Op);
13121
13122 SDValue X = Op.getOperand(i: 0);
13123 SDValue ScaleConstant = DAG.getConstantFP(Val: 0x1.0p-767, DL, VT: MVT::f64);
13124
13125 SDValue Scaling = DAG.getSetCC(DL, VT: MVT::i1, LHS: X, RHS: ScaleConstant, Cond: ISD::SETOLT);
13126
13127 SDValue ZeroInt = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
13128
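 // As in the f32 path, inputs below the 2^-767 threshold are scaled up (here
 // with ldexp by +256) to keep the iteration away from denormals, and since
 // sqrt(x * 2^256) = sqrt(x) * 2^128 the result is rescaled with ldexp by -128
 // at the end.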
13129 // Scale up input if it is too small.
13130 SDValue ScaleUpFactor = DAG.getConstant(Val: 256, DL, VT: MVT::i32);
13131 SDValue ScaleUp =
13132 DAG.getNode(Opcode: ISD::SELECT, DL, VT: MVT::i32, N1: Scaling, N2: ScaleUpFactor, N3: ZeroInt);
13133 SDValue SqrtX = DAG.getNode(Opcode: ISD::FLDEXP, DL, VT: MVT::f64, N1: X, N2: ScaleUp, Flags);
13134
13135 SDValue SqrtY = DAG.getNode(Opcode: AMDGPUISD::RSQ, DL, VT: MVT::f64, Operand: SqrtX);
13136
13137 SDValue SqrtS0 = DAG.getNode(Opcode: ISD::FMUL, DL, VT: MVT::f64, N1: SqrtX, N2: SqrtY);
13138
13139 SDValue Half = DAG.getConstantFP(Val: 0.5, DL, VT: MVT::f64);
13140 SDValue SqrtH0 = DAG.getNode(Opcode: ISD::FMUL, DL, VT: MVT::f64, N1: SqrtY, N2: Half);
13141
13142 SDValue NegSqrtH0 = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f64, Operand: SqrtH0);
13143 SDValue SqrtR0 = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: NegSqrtH0, N2: SqrtS0, N3: Half);
13144
13145 SDValue SqrtH1 = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: SqrtH0, N2: SqrtR0, N3: SqrtH0);
13146
13147 SDValue SqrtS1 = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: SqrtS0, N2: SqrtR0, N3: SqrtS0);
13148
13149 SDValue NegSqrtS1 = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f64, Operand: SqrtS1);
13150 SDValue SqrtD0 =
13151 DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: NegSqrtS1, N2: SqrtS1, N3: SqrtX);
13152
13153 SDValue SqrtS2 = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: SqrtD0, N2: SqrtH1, N3: SqrtS1);
13154
13155 SDValue NegSqrtS2 = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f64, Operand: SqrtS2);
13156 SDValue SqrtD1 =
13157 DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: NegSqrtS2, N2: SqrtS2, N3: SqrtX);
13158
13159 SDValue SqrtRet = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: SqrtD1, N2: SqrtH1, N3: SqrtS2);
13160
13161 SDValue ScaleDownFactor = DAG.getSignedConstant(Val: -128, DL, VT: MVT::i32);
13162 SDValue ScaleDown =
13163 DAG.getNode(Opcode: ISD::SELECT, DL, VT: MVT::i32, N1: Scaling, N2: ScaleDownFactor, N3: ZeroInt);
13164 SqrtRet = DAG.getNode(Opcode: ISD::FLDEXP, DL, VT: MVT::f64, N1: SqrtRet, N2: ScaleDown, Flags);
13165
13166 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
13167 // with finite only or nsz because rsq(+/-0) = +/-inf
13168
13169 // TODO: Check for DAZ and expand to subnormals
13170 SDValue IsZeroOrInf =
13171 DAG.getNode(Opcode: ISD::IS_FPCLASS, DL, VT: MVT::i1, N1: SqrtX,
13172 N2: DAG.getTargetConstant(Val: fcZero | fcPosInf, DL, VT: MVT::i32));
13173
13174 // If x is +INF, +0, or -0, use its original value
13175 return DAG.getNode(Opcode: ISD::SELECT, DL, VT: MVT::f64, N1: IsZeroOrInf, N2: SqrtX, N3: SqrtRet,
13176 Flags);
13177}
13178
13179SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
13180 SDLoc DL(Op);
13181 EVT VT = Op.getValueType();
13182 SDValue Arg = Op.getOperand(i: 0);
13183 SDValue TrigVal;
13184
13185 // Propagate fast-math flags so that the multiply we introduce can be folded
13186 // if Arg is already the result of a multiply by constant.
13187 auto Flags = Op->getFlags();
13188
13189 // AMDGPUISD nodes of vector type must be unrolled here since
13190 // they will not be expanded elsewhere.
13191 auto UnrollIfVec = [&DAG](SDValue V) -> SDValue {
13192 if (!V.getValueType().isVector())
13193 return V;
13194
13195 return DAG.UnrollVectorOp(N: cast<SDNode>(Val&: V));
13196 };
13197
13198 SDValue OneOver2Pi = DAG.getConstantFP(Val: 0.5 * numbers::inv_pi, DL, VT);
13199
13200 if (Subtarget->hasTrigReducedRange()) {
13201 SDValue MulVal = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Arg, N2: OneOver2Pi, Flags);
13202 TrigVal = UnrollIfVec(DAG.getNode(Opcode: AMDGPUISD::FRACT, DL, VT, Operand: MulVal, Flags));
13203 } else {
13204 TrigVal = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Arg, N2: OneOver2Pi, Flags);
13205 }
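 // The hardware sin/cos expect their input pre-scaled by 1/(2*pi), hence the
 // multiply above. On subtargets with a reduced valid input range, the argument
 // is additionally wrapped into [0, 1) with FRACT before emitting the HW node.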
13206
13207 switch (Op.getOpcode()) {
13208 case ISD::FCOS:
13209 TrigVal = DAG.getNode(Opcode: AMDGPUISD::COS_HW, DL: SDLoc(Op), VT, Operand: TrigVal, Flags);
13210 break;
13211 case ISD::FSIN:
13212 TrigVal = DAG.getNode(Opcode: AMDGPUISD::SIN_HW, DL: SDLoc(Op), VT, Operand: TrigVal, Flags);
13213 break;
13214 default:
13215 llvm_unreachable("Wrong trig opcode");
13216 }
13217
13218 return UnrollIfVec(TrigVal);
13219}
13220
13221SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
13222 SelectionDAG &DAG) const {
13223 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Val&: Op);
13224 assert(AtomicNode->isCompareAndSwap());
13225 unsigned AS = AtomicNode->getAddressSpace();
13226
13227 // No custom lowering required for local address space
13228 if (!AMDGPU::isFlatGlobalAddrSpace(AS))
13229 return Op;
13230
13231 // Non-local address spaces require custom lowering for atomic compare and
13232 // swap; the cmp and swap values are packed into a v2i32, or a v2i64 for _X2.
13233 SDLoc DL(Op);
13234 SDValue ChainIn = Op.getOperand(i: 0);
13235 SDValue Addr = Op.getOperand(i: 1);
13236 SDValue Old = Op.getOperand(i: 2);
13237 SDValue New = Op.getOperand(i: 3);
13238 EVT VT = Op.getValueType();
13239 MVT SimpleVT = VT.getSimpleVT();
13240 MVT VecType = MVT::getVectorVT(VT: SimpleVT, NumElements: 2);
13241
13242 SDValue NewOld = DAG.getBuildVector(VT: VecType, DL, Ops: {New, Old});
13243 SDValue Ops[] = {ChainIn, Addr, NewOld};
13244
13245 return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::ATOMIC_CMP_SWAP, dl: DL,
13246 VTList: Op->getVTList(), Ops, MemVT: VT,
13247 MMO: AtomicNode->getMemOperand());
13248}
13249
13250//===----------------------------------------------------------------------===//
13251// Custom DAG optimizations
13252//===----------------------------------------------------------------------===//
13253
13254SDValue
13255SITargetLowering::performUCharToFloatCombine(SDNode *N,
13256 DAGCombinerInfo &DCI) const {
13257 EVT VT = N->getValueType(ResNo: 0);
13258 EVT ScalarVT = VT.getScalarType();
13259 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
13260 return SDValue();
13261
13262 SelectionDAG &DAG = DCI.DAG;
13263 SDLoc DL(N);
13264
13265 SDValue Src = N->getOperand(Num: 0);
13266 EVT SrcVT = Src.getValueType();
13267
13268 // TODO: We could try to match extracting the higher bytes, which would be
13269 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
13270 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
13271 // about in practice.
13272 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
13273 if (DAG.MaskedValueIsZero(Op: Src, Mask: APInt::getHighBitsSet(numBits: 32, hiBitsSet: 24))) {
13274 SDValue Cvt = DAG.getNode(Opcode: AMDGPUISD::CVT_F32_UBYTE0, DL, VT: MVT::f32, Operand: Src);
13275 DCI.AddToWorklist(N: Cvt.getNode());
13276
13277 // For the f16 case, fold to a cast to f32 and then cast back to f16.
13278 if (ScalarVT != MVT::f32) {
13279 Cvt = DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT, N1: Cvt,
13280 N2: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32));
13281 }
13282 return Cvt;
13283 }
13284 }
13285
13286 return SDValue();
13287}
13288
13289SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
13290 DAGCombinerInfo &DCI) const {
13291 SDValue MagnitudeOp = N->getOperand(Num: 0);
13292 SDValue SignOp = N->getOperand(Num: 1);
13293
13294 // The generic combine for fcopysign + fp cast is too conservative with
13295 // vectors, and also gets confused by the splitting we will perform here, so
13296 // peek through FP casts.
13297 if (SignOp.getOpcode() == ISD::FP_EXTEND ||
13298 SignOp.getOpcode() == ISD::FP_ROUND)
13299 SignOp = SignOp.getOperand(i: 0);
13300
13301 SelectionDAG &DAG = DCI.DAG;
13302 SDLoc DL(N);
13303 EVT SignVT = SignOp.getValueType();
13304
13305 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
13306 // lower half with a copy.
13307 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
13308 EVT MagVT = MagnitudeOp.getValueType();
13309
13310 unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
13311
13312 if (MagVT.getScalarType() == MVT::f64) {
13313 EVT F32VT = MagVT.isVector()
13314 ? EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::f32, NumElements: 2 * NumElts)
13315 : MVT::v2f32;
13316
13317 SDValue MagAsVector = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: F32VT, Operand: MagnitudeOp);
13318
13319 SmallVector<SDValue, 8> NewElts;
13320 for (unsigned I = 0; I != NumElts; ++I) {
13321 SDValue MagLo =
13322 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::f32, N1: MagAsVector,
13323 N2: DAG.getConstant(Val: 2 * I, DL, VT: MVT::i32));
13324 SDValue MagHi =
13325 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::f32, N1: MagAsVector,
13326 N2: DAG.getConstant(Val: 2 * I + 1, DL, VT: MVT::i32));
13327
13328 SDValue SignOpElt =
13329 MagVT.isVector()
13330 ? DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: SignVT.getScalarType(),
13331 N1: SignOp, N2: DAG.getConstant(Val: I, DL, VT: MVT::i32))
13332 : SignOp;
13333
13334 SDValue HiOp =
13335 DAG.getNode(Opcode: ISD::FCOPYSIGN, DL, VT: MVT::f32, N1: MagHi, N2: SignOpElt);
13336
13337 SDValue Vector =
13338 DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL, VT: MVT::v2f32, N1: MagLo, N2: HiOp);
13339
13340 SDValue NewElt = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f64, Operand: Vector);
13341 NewElts.push_back(Elt: NewElt);
13342 }
13343
13344 if (NewElts.size() == 1)
13345 return NewElts[0];
13346
13347 return DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL, VT: MagVT, Ops: NewElts);
13348 }
13349
13350 if (SignVT.getScalarType() != MVT::f64)
13351 return SDValue();
13352
13353 // Reduce the width of the sign operand; we only need the highest bit.
13354 //
13355 // fcopysign f64:x, f64:y ->
13356 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
13357 // TODO: In some cases it might make sense to go all the way to f16.
13358
13359 EVT F32VT = MagVT.isVector()
13360 ? EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::f32, NumElements: 2 * NumElts)
13361 : MVT::v2f32;
13362
13363 SDValue SignAsVector = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: F32VT, Operand: SignOp);
13364
13365 SmallVector<SDValue, 8> F32Signs;
13366 for (unsigned I = 0; I != NumElts; ++I) {
13367 // Take sign from odd elements of cast vector
13368 SDValue SignAsF32 =
13369 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::f32, N1: SignAsVector,
13370 N2: DAG.getConstant(Val: 2 * I + 1, DL, VT: MVT::i32));
13371 F32Signs.push_back(Elt: SignAsF32);
13372 }
13373
13374 SDValue NewSign =
13375 NumElts == 1
13376 ? F32Signs.back()
13377 : DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL,
13378 VT: EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::f32, NumElements: NumElts),
13379 Ops: F32Signs);
13380
13381 return DAG.getNode(Opcode: ISD::FCOPYSIGN, DL, VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 0),
13382 N2: NewSign);
13383}
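
// For illustration, a sketch of the f64-magnitude branch above on a scalar
// (little-endian 32-bit halves, as assumed throughout this function):
//
//   (fcopysign f64:x, _:s)
//     -> (bitcast f64 (build_vector
//            (extract_vector_elt (bitcast v2f32 x), 0),
//            (fcopysign f32 (extract_vector_elt (bitcast v2f32 x), 1), s)))
//
// i.e. the low 32 bits of x are copied through unchanged and only the high
// 32 bits, which hold the sign bit, go through an f32 copysign.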
13384
13385// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
13386// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
13387// bits
13388
13389// This is a variant of
13390// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
13391//
13392// The normal DAG combiner will do this, but only if the add has one use since
13393// that would increase the number of instructions.
13394//
13395// This prevents us from seeing a constant offset that can be folded into a
13396// memory instruction's addressing mode. If we know the resulting add offset of
13397// a pointer can be folded into an addressing offset, we can replace the pointer
13398// operand with the add of new constant offset. This eliminates one of the uses,
13399// and may allow the remaining use to also be simplified.
13400//
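// For illustration, a hypothetical example with c1 = 16 and c2 = 2 (constants
// chosen arbitrarily):
//   (shl (add x, 16), 2) --> (add (shl x, 2), 64)
// A load addressed by the result can then fold the +64 into its immediate
// offset (provided isLegalAddressingMode accepts an offset of 64 for the
// address space), leaving (shl x, 2) as the base pointer operand.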
13401SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
13402 EVT MemVT,
13403 DAGCombinerInfo &DCI) const {
13404 SDValue N0 = N->getOperand(Num: 0);
13405 SDValue N1 = N->getOperand(Num: 1);
13406
13407 // We only do this to handle cases where it is profitable, i.e. when there are
13408 // multiple uses of the add/or; otherwise defer to the standard combine.
13409 if ((!N0->isAnyAdd() && N0.getOpcode() != ISD::OR) || N0->hasOneUse())
13410 return SDValue();
13411
13412 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(Val&: N1);
13413 if (!CN1)
13414 return SDValue();
13415
13416 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(Val: N0.getOperand(i: 1));
13417 if (!CAdd)
13418 return SDValue();
13419
13420 SelectionDAG &DAG = DCI.DAG;
13421
13422 if (N0->getOpcode() == ISD::OR &&
13423 !DAG.haveNoCommonBitsSet(A: N0.getOperand(i: 0), B: N0.getOperand(i: 1)))
13424 return SDValue();
13425
13426 // If the resulting offset is too large, we can't fold it into the
13427 // addressing mode offset.
13428 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
13429 Type *Ty = MemVT.getTypeForEVT(Context&: *DCI.DAG.getContext());
13430
13431 AddrMode AM;
13432 AM.HasBaseReg = true;
13433 AM.BaseOffs = Offset.getSExtValue();
13434 if (!isLegalAddressingMode(DL: DCI.DAG.getDataLayout(), AM, Ty, AS: AddrSpace))
13435 return SDValue();
13436
13437 SDLoc SL(N);
13438 EVT VT = N->getValueType(ResNo: 0);
13439
13440 SDValue ShlX = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT, N1: N0.getOperand(i: 0), N2: N1);
13441 SDValue COffset = DAG.getConstant(Val: Offset, DL: SL, VT);
13442
13443 SDNodeFlags Flags;
13444 Flags.setNoUnsignedWrap(
13445 N->getFlags().hasNoUnsignedWrap() &&
13446 (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));
13447
13448 // Use ISD::ADD even if the original operation was ISD::PTRADD, since we can't
13449 // be sure that the new left operand is a proper base pointer.
13450 return DAG.getNode(Opcode: ISD::ADD, DL: SL, VT, N1: ShlX, N2: COffset, Flags);
13451}
13452
13453/// MemSDNode::getBasePtr() does not work for intrinsics, whose pointer operand is
13454/// offset by the chain and intrinsic ID. Theoretically we would also need to
13455/// check the specific intrinsic, but they all place the pointer operand first.
13456static unsigned getBasePtrIndex(const MemSDNode *N) {
13457 switch (N->getOpcode()) {
13458 case ISD::STORE:
13459 case ISD::INTRINSIC_W_CHAIN:
13460 case ISD::INTRINSIC_VOID:
13461 return 2;
13462 default:
13463 return 1;
13464 }
13465}
13466
13467SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
13468 DAGCombinerInfo &DCI) const {
13469 SelectionDAG &DAG = DCI.DAG;
13470
13471 unsigned PtrIdx = getBasePtrIndex(N);
13472 SDValue Ptr = N->getOperand(Num: PtrIdx);
13473
13474 // TODO: We could also do this for multiplies.
13475 if (Ptr.getOpcode() == ISD::SHL) {
13476 SDValue NewPtr = performSHLPtrCombine(N: Ptr.getNode(), AddrSpace: N->getAddressSpace(),
13477 MemVT: N->getMemoryVT(), DCI);
13478 if (NewPtr) {
13479 SmallVector<SDValue, 8> NewOps(N->ops());
13480
13481 NewOps[PtrIdx] = NewPtr;
13482 return SDValue(DAG.UpdateNodeOperands(N, Ops: NewOps), 0);
13483 }
13484 }
13485
13486 return SDValue();
13487}
13488
13489static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
13490 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
13491 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
13492 (Opc == ISD::XOR && Val == 0);
13493}
13494
13495// Break up a 64-bit bitwise operation with a constant into two 32-bit and/or/xor
13496// operations. This will typically happen anyway for a VALU 64-bit and. This
13497// exposes other 32-bit integer combine opportunities since most 64-bit
13498// operations are decomposed this way. TODO: We won't want this for SALU,
13499// especially if the constant is an inline immediate.
13500SDValue SITargetLowering::splitBinaryBitConstantOp(
13501 DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
13502 const ConstantSDNode *CRHS) const {
13503 uint64_t Val = CRHS->getZExtValue();
13504 uint32_t ValLo = Lo_32(Value: Val);
13505 uint32_t ValHi = Hi_32(Value: Val);
13506 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13507
13508 if ((bitOpWithConstantIsReducible(Opc, Val: ValLo) ||
13509 bitOpWithConstantIsReducible(Opc, Val: ValHi)) ||
13510 (CRHS->hasOneUse() && !TII->isInlineConstant(Imm: CRHS->getAPIntValue()))) {
13511 // We have 64-bit scalar and/or/xor, but do not have vector forms.
13512 if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
13513 !CRHS->user_begin()->isDivergent())
13514 return SDValue();
13515
13516 // If we need to materialize a 64-bit immediate, it will be split up later
13517 // anyway. Avoid creating the harder to understand 64-bit immediate
13518 // materialization.
13519 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
13520 }
13521
13522 return SDValue();
13523}
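
// For illustration, a sketch of the split above with an arbitrarily chosen
// constant: for a divergent
//   (and i64:x, 0x00000000ffff0000)
// the low half becomes (and i32:x.lo, 0xffff0000) and the high half becomes
// (and i32:x.hi, 0), which is reducible to the constant 0, so the 64-bit AND
// collapses into a single 32-bit AND plus a zero high half.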
13524
13525bool llvm::isBoolSGPR(SDValue V) {
13526 if (V.getValueType() != MVT::i1)
13527 return false;
13528 switch (V.getOpcode()) {
13529 default:
13530 break;
13531 case ISD::SETCC:
13532 case ISD::IS_FPCLASS:
13533 case AMDGPUISD::FP_CLASS:
13534 return true;
13535 case ISD::AND:
13536 case ISD::OR:
13537 case ISD::XOR:
13538 return isBoolSGPR(V: V.getOperand(i: 0)) && isBoolSGPR(V: V.getOperand(i: 1));
13539 case ISD::SADDO:
13540 case ISD::UADDO:
13541 case ISD::SSUBO:
13542 case ISD::USUBO:
13543 case ISD::SMULO:
13544 case ISD::UMULO:
13545 return V.getResNo() == 1;
13546 case ISD::INTRINSIC_WO_CHAIN: {
13547 unsigned IntrinsicID = V.getConstantOperandVal(i: 0);
13548 switch (IntrinsicID) {
13549 case Intrinsic::amdgcn_is_shared:
13550 case Intrinsic::amdgcn_is_private:
13551 return true;
13552 default:
13553 return false;
13554 }
13555
13556 return false;
13557 }
13558 }
13559 return false;
13560}
13561
13562// If each byte of a constant is either all zeroes or all ones, return it.
13563// Otherwise return 0.
13564static uint32_t getConstantPermuteMask(uint32_t C) {
13565 // 0xff for any zero byte in the mask
13566 uint32_t ZeroByteMask = 0;
13567 if (!(C & 0x000000ff))
13568 ZeroByteMask |= 0x000000ff;
13569 if (!(C & 0x0000ff00))
13570 ZeroByteMask |= 0x0000ff00;
13571 if (!(C & 0x00ff0000))
13572 ZeroByteMask |= 0x00ff0000;
13573 if (!(C & 0xff000000))
13574 ZeroByteMask |= 0xff000000;
13575 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
13576 if ((NonZeroByteMask & C) != NonZeroByteMask)
13577 return 0; // Partial bytes selected.
13578 return C;
13579}
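
// Worked examples (for illustration):
//   getConstantPermuteMask(0x00ff00ff) == 0x00ff00ff  (whole bytes selected)
//   getConstantPermuteMask(0xff000000) == 0xff000000
//   getConstantPermuteMask(0x00000f00) == 0           (partial byte selected)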
13580
13581// Check if a node selects whole bytes from its operand 0 starting at a byte
13582// boundary while masking the rest. Returns the select mask as used by v_perm_b32,
13583// or ~0u if the node does not match.
13584// Note byte select encoding:
13585// value 0-3 selects corresponding source byte;
13586// value 0xc selects zero;
13587// value 0xff selects 0xff.
13588static uint32_t getPermuteMask(SDValue V) {
13589 assert(V.getValueSizeInBits() == 32);
13590
13591 if (V.getNumOperands() != 2)
13592 return ~0;
13593
13594 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(Val: V.getOperand(i: 1));
13595 if (!N1)
13596 return ~0;
13597
13598 uint32_t C = N1->getZExtValue();
13599
13600 switch (V.getOpcode()) {
13601 default:
13602 break;
13603 case ISD::AND:
13604 if (uint32_t ConstMask = getConstantPermuteMask(C))
13605 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
13606 break;
13607
13608 case ISD::OR:
13609 if (uint32_t ConstMask = getConstantPermuteMask(C))
13610 return (0x03020100 & ~ConstMask) | ConstMask;
13611 break;
13612
13613 case ISD::SHL:
13614 if (C % 8)
13615 return ~0;
13616
13617 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
13618
13619 case ISD::SRL:
13620 if (C % 8)
13621 return ~0;
13622
13623 return uint32_t(0x0c0c0c0c03020100ull >> C);
13624 }
13625
13626 return ~0;
13627}
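
// Worked examples of the cases above (for illustration):
//   (and x, 0x0000ffff) -> 0x0c0c0100  (bytes 1:0 taken from x, upper bytes zero)
//   (shl x, 16)         -> 0x01000c0c  (bytes 1:0 of x moved into bytes 3:2)
//   (srl x, 8)          -> 0x0c030201  (bytes 3:1 of x moved into bytes 2:0)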
13628
13629SDValue SITargetLowering::performAndCombine(SDNode *N,
13630 DAGCombinerInfo &DCI) const {
13631 if (DCI.isBeforeLegalize())
13632 return SDValue();
13633
13634 SelectionDAG &DAG = DCI.DAG;
13635 EVT VT = N->getValueType(ResNo: 0);
13636 SDValue LHS = N->getOperand(Num: 0);
13637 SDValue RHS = N->getOperand(Num: 1);
13638
13639 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
13640 if (VT == MVT::i64 && CRHS) {
13641 if (SDValue Split =
13642 splitBinaryBitConstantOp(DCI, SL: SDLoc(N), Opc: ISD::AND, LHS, CRHS))
13643 return Split;
13644 }
13645
13646 if (CRHS && VT == MVT::i32) {
13647 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
13648 // nb = number of trailing zeroes in mask
13649 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
13650 // given that we are selecting 8 or 16 bit fields starting at a byte boundary.
13651 uint64_t Mask = CRHS->getZExtValue();
13652 unsigned Bits = llvm::popcount(Value: Mask);
13653 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
13654 (Bits == 8 || Bits == 16) && isShiftedMask_64(Value: Mask) && !(Mask & 1)) {
13655 if (auto *CShift = dyn_cast<ConstantSDNode>(Val: LHS->getOperand(Num: 1))) {
13656 unsigned Shift = CShift->getZExtValue();
13657 unsigned NB = CRHS->getAPIntValue().countr_zero();
13658 unsigned Offset = NB + Shift;
13659 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
13660 SDLoc SL(N);
13661 SDValue BFE =
13662 DAG.getNode(Opcode: AMDGPUISD::BFE_U32, DL: SL, VT: MVT::i32, N1: LHS->getOperand(Num: 0),
13663 N2: DAG.getConstant(Val: Offset, DL: SL, VT: MVT::i32),
13664 N3: DAG.getConstant(Val: Bits, DL: SL, VT: MVT::i32));
13665 EVT NarrowVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: Bits);
13666 SDValue Ext = DAG.getNode(Opcode: ISD::AssertZext, DL: SL, VT, N1: BFE,
13667 N2: DAG.getValueType(NarrowVT));
13668 SDValue Shl = DAG.getNode(Opcode: ISD::SHL, DL: SDLoc(LHS), VT, N1: Ext,
13669 N2: DAG.getConstant(Val: NB, DL: SDLoc(CRHS), VT: MVT::i32));
13670 return Shl;
13671 }
13672 }
13673 }
13674
13675 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
13676 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
13677 isa<ConstantSDNode>(Val: LHS.getOperand(i: 2))) {
13678 uint32_t Sel = getConstantPermuteMask(C: Mask);
13679 if (!Sel)
13680 return SDValue();
13681
13682 // Select 0xc for all zero bytes
13683 Sel = (LHS.getConstantOperandVal(i: 2) & Sel) | (~Sel & 0x0c0c0c0c);
13684 SDLoc DL(N);
13685 return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: LHS.getOperand(i: 0),
13686 N2: LHS.getOperand(i: 1), N3: DAG.getConstant(Val: Sel, DL, VT: MVT::i32));
13687 }
13688 }
13689
13690 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
13691 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
13692 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
13693 ISD::CondCode LCC = cast<CondCodeSDNode>(Val: LHS.getOperand(i: 2))->get();
13694 ISD::CondCode RCC = cast<CondCodeSDNode>(Val: RHS.getOperand(i: 2))->get();
13695
13696 SDValue X = LHS.getOperand(i: 0);
13697 SDValue Y = RHS.getOperand(i: 0);
13698 if (Y.getOpcode() != ISD::FABS || Y.getOperand(i: 0) != X ||
13699 !isTypeLegal(VT: X.getValueType()))
13700 return SDValue();
13701
13702 if (LCC == ISD::SETO) {
13703 if (X != LHS.getOperand(i: 1))
13704 return SDValue();
13705
13706 if (RCC == ISD::SETUNE) {
13707 const ConstantFPSDNode *C1 =
13708 dyn_cast<ConstantFPSDNode>(Val: RHS.getOperand(i: 1));
13709 if (!C1 || !C1->isInfinity() || C1->isNegative())
13710 return SDValue();
13711
13712 const uint32_t Mask = SIInstrFlags::N_NORMAL |
13713 SIInstrFlags::N_SUBNORMAL | SIInstrFlags::N_ZERO |
13714 SIInstrFlags::P_ZERO | SIInstrFlags::P_SUBNORMAL |
13715 SIInstrFlags::P_NORMAL;
13716
13717 static_assert(
13718 ((~(SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN |
13719 SIInstrFlags::N_INFINITY | SIInstrFlags::P_INFINITY)) &
13720 0x3ff) == Mask,
13721 "mask not equal");
13722
13723 SDLoc DL(N);
13724 return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL, VT: MVT::i1, N1: X,
13725 N2: DAG.getConstant(Val: Mask, DL, VT: MVT::i32));
13726 }
13727 }
13728 }
13729
13730 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
13731 std::swap(a&: LHS, b&: RHS);
13732
13733 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13734 RHS.hasOneUse()) {
13735 ISD::CondCode LCC = cast<CondCodeSDNode>(Val: LHS.getOperand(i: 2))->get();
13736 // and (fcmp seto),  (fp_class x, mask) -> fp_class x, mask & ~(s_nan | q_nan)
13737 // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (s_nan | q_nan)
13739 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Val: RHS.getOperand(i: 1));
13740 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
13741 (RHS.getOperand(i: 0) == LHS.getOperand(i: 0) &&
13742 LHS.getOperand(i: 0) == LHS.getOperand(i: 1))) {
13743 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
13744 unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
13745 : Mask->getZExtValue() & OrdMask;
13746
13747 SDLoc DL(N);
13748 return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL, VT: MVT::i1, N1: RHS.getOperand(i: 0),
13749 N2: DAG.getConstant(Val: NewMask, DL, VT: MVT::i32));
13750 }
13751 }
13752
13753 if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
13754 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
13755 // and x, (sext cc from i1) => select cc, x, 0
13756 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
13757 std::swap(a&: LHS, b&: RHS);
13758 if (isBoolSGPR(V: RHS.getOperand(i: 0)))
13759 return DAG.getSelect(DL: SDLoc(N), VT: MVT::i32, Cond: RHS.getOperand(i: 0), LHS,
13760 RHS: DAG.getConstant(Val: 0, DL: SDLoc(N), VT: MVT::i32));
13761 }
13762
13763 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
13764 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13765 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
13766 N->isDivergent() && TII->pseudoToMCOpcode(Opcode: AMDGPU::V_PERM_B32_e64) != -1) {
13767 uint32_t LHSMask = getPermuteMask(V: LHS);
13768 uint32_t RHSMask = getPermuteMask(V: RHS);
13769 if (LHSMask != ~0u && RHSMask != ~0u) {
13770 // Canonicalize the expression in an attempt to have fewer unique masks
13771 // and therefore fewer registers used to hold the masks.
13772 if (LHSMask > RHSMask) {
13773 std::swap(a&: LHSMask, b&: RHSMask);
13774 std::swap(a&: LHS, b&: RHS);
13775 }
13776
13777 // Select 0xc for each lane used from the source operand: a zero byte has
13778 // 0xc in the mask, a 0xff byte has 0xff, and actual lanes are in the 0-3 range.
13779 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13780 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13781
13782 // Check if we need to combine values from two sources within a byte.
13783 if (!(LHSUsedLanes & RHSUsedLanes) &&
13784 // If we select the high and low words, keep it for SDWA.
13785 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
13786 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13787 // Each byte in each mask is either a selector value 0-3, or has
13788 // higher bits set: 0xff for a 0xff byte or 0x0c for a zero byte. If
13789 // either mask has 0x0c for a byte, the result byte must be 0x0c;
13790 // otherwise the mask byte that is not 0xff wins. ANDing both masks
13791 // gives the correct result, except that 0x0c bytes must be fixed up.
13792 uint32_t Mask = LHSMask & RHSMask;
13793 for (unsigned I = 0; I < 32; I += 8) {
13794 uint32_t ByteSel = 0xff << I;
13795 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
13796 Mask &= (0x0c << I) & 0xffffffff;
13797 }
13798
13799 // Add 4 to each active LHS lane. It will not affect any existing 0xff
13800 // or 0x0c.
13801 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
13802 SDLoc DL(N);
13803
13804 return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: LHS.getOperand(i: 0),
13805 N2: RHS.getOperand(i: 0),
13806 N3: DAG.getConstant(Val: Sel, DL, VT: MVT::i32));
13807 }
13808 }
13809 }
13810
13811 return SDValue();
13812}
13813
13814// A key component of v_perm is the mapping between the byte positions of the src
13815// operands and the byte positions of the dest. To build it, we need: 1. the node
13816// that provides byte x of the dest of the OR, and 2. the byte of that node used
13817// to provide byte x. calculateByteProvider finds which node provides a certain
13818// byte of the dest of the OR, and calculateSrcByte takes that node and finds the
13819// ultimate src and byte position. For example, the supported LoadCombine pattern
13820// for vector loads is as follows:
13821// t1
13822// or
13823// / \
13824// t2 t3
13825// zext shl
13826// | | \
13827// t4 t5 16
13828// or anyext
13829// / \ |
13830// t6 t7 t8
13831// srl shl or
13832// / | / \ / \
13833// t9 t10 t11 t12 t13 t14
13834// trunc* 8 trunc* 8 and and
13835// | | / | | \
13836// t15 t16 t17 t18 t19 t20
13837// trunc* 255 srl -256
13838// | / \
13839// t15 t15 16
13840//
13841// *In this example, the truncs are from i32->i16
13842//
13843// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
13844// respectively. calculateSrcByte would find (given node) -> ultimate src &
13845// byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
13846// After finding the mapping, we can combine the tree into vperm t15, t16,
13847// 0x05000407
13848
13849// Find the ultimate source and byte position for a node.
13850// \p DestByte is the byte position of the dest of the or that the src
13851// ultimately provides. \p SrcIndex is the byte of the src that maps to that
13852// byte of the dest of the or. \p Depth tracks how many recursive iterations we
13853// have performed.
13854static const std::optional<ByteProvider<SDValue>>
13855calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
13856 unsigned Depth = 0) {
13857 // We may need to recursively traverse a series of SRLs
13858 if (Depth >= 6)
13859 return std::nullopt;
13860
13861 if (Op.getValueSizeInBits() < 8)
13862 return std::nullopt;
13863
13864 if (Op.getValueType().isVector())
13865 return ByteProvider<SDValue>::getSrc(Val: Op, ByteOffset: DestByte, VectorOffset: SrcIndex);
13866
13867 switch (Op->getOpcode()) {
13868 case ISD::TRUNCATE: {
13869 return calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte, SrcIndex, Depth: Depth + 1);
13870 }
13871
13872 case ISD::ANY_EXTEND:
13873 case ISD::SIGN_EXTEND:
13874 case ISD::ZERO_EXTEND:
13875 case ISD::SIGN_EXTEND_INREG: {
13876 SDValue NarrowOp = Op->getOperand(Num: 0);
13877 auto NarrowVT = NarrowOp.getValueType();
13878 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
13879 auto *VTSign = cast<VTSDNode>(Val: Op->getOperand(Num: 1));
13880 NarrowVT = VTSign->getVT();
13881 }
13882 if (!NarrowVT.isByteSized())
13883 return std::nullopt;
13884 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
13885
13886 if (SrcIndex >= NarrowByteWidth)
13887 return std::nullopt;
13888 return calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte, SrcIndex, Depth: Depth + 1);
13889 }
13890
13891 case ISD::SRA:
13892 case ISD::SRL: {
13893 auto *ShiftOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
13894 if (!ShiftOp)
13895 return std::nullopt;
13896
13897 uint64_t BitShift = ShiftOp->getZExtValue();
13898
13899 if (BitShift % 8 != 0)
13900 return std::nullopt;
13901
13902 SrcIndex += BitShift / 8;
13903
13904 return calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte, SrcIndex, Depth: Depth + 1);
13905 }
13906
13907 default: {
13908 return ByteProvider<SDValue>::getSrc(Val: Op, ByteOffset: DestByte, VectorOffset: SrcIndex);
13909 }
13910 }
13911 llvm_unreachable("fully handled switch");
13912}
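
// For illustration, a sketch of how calculateSrcByte walks shifts and casts:
// asking for SrcIndex 0 of (srl (trunc i32:t to i16), 8) adds 8 / 8 = 1 to
// SrcIndex at the SRL, looks through the TRUNCATE, and reports {t, byte 1}.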
13913
13914// For a byte position in the result of an Or, traverse the tree and find the
13915// node (and the byte of the node) which ultimately provides this {Or,
13916// BytePosition}. \p Op is the operand we are currently examining. \p Index is
13917// the byte position of the Op that corresponds with the originally requested
13918// byte of the Or. \p Depth tracks how many recursive iterations we have
13919// performed. \p StartingIndex is the originally requested byte of the Or.
13920static const std::optional<ByteProvider<SDValue>>
13921calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
13922 unsigned StartingIndex = 0) {
13923 // Finding Src tree of RHS of or typically requires at least 1 additional
13924 // depth
13925 if (Depth > 6)
13926 return std::nullopt;
13927
13928 unsigned BitWidth = Op.getScalarValueSizeInBits();
13929 if (BitWidth % 8 != 0)
13930 return std::nullopt;
13931 if (Index > BitWidth / 8 - 1)
13932 return std::nullopt;
13933
13934 bool IsVec = Op.getValueType().isVector();
13935 switch (Op.getOpcode()) {
13936 case ISD::OR: {
13937 if (IsVec)
13938 return std::nullopt;
13939
13940 auto RHS = calculateByteProvider(Op: Op.getOperand(i: 1), Index, Depth: Depth + 1,
13941 StartingIndex);
13942 if (!RHS)
13943 return std::nullopt;
13944 auto LHS = calculateByteProvider(Op: Op.getOperand(i: 0), Index, Depth: Depth + 1,
13945 StartingIndex);
13946 if (!LHS)
13947 return std::nullopt;
13948 // A well formed Or will have two ByteProviders for each byte, one of which
13949 // is constant zero
13950 if (!LHS->isConstantZero() && !RHS->isConstantZero())
13951 return std::nullopt;
13952 if (!LHS || LHS->isConstantZero())
13953 return RHS;
13954 if (!RHS || RHS->isConstantZero())
13955 return LHS;
13956 return std::nullopt;
13957 }
13958
13959 case ISD::AND: {
13960 if (IsVec)
13961 return std::nullopt;
13962
13963 auto *BitMaskOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
13964 if (!BitMaskOp)
13965 return std::nullopt;
13966
13967 uint32_t BitMask = BitMaskOp->getZExtValue();
13968 // Bits we expect for our Index
13969 uint32_t IndexMask = 0xFF << (Index * 8);
13970
13971 if ((IndexMask & BitMask) != IndexMask) {
13972 // If the result of the and partially provides the byte, then it
13973 // is not well formatted
13974 if (IndexMask & BitMask)
13975 return std::nullopt;
13976 return ByteProvider<SDValue>::getConstantZero();
13977 }
13978
13979 return calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte: StartingIndex, SrcIndex: Index);
13980 }
13981
13982 case ISD::FSHR: {
13983 if (IsVec)
13984 return std::nullopt;
13985
13986 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
13987 auto *ShiftOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 2));
13988 if (!ShiftOp || Op.getValueType().isVector())
13989 return std::nullopt;
13990
13991 uint64_t BitsProvided = Op.getValueSizeInBits();
13992 if (BitsProvided % 8 != 0)
13993 return std::nullopt;
13994
13995 uint64_t BitShift = ShiftOp->getAPIntValue().urem(RHS: BitsProvided);
13996 if (BitShift % 8)
13997 return std::nullopt;
13998
13999 uint64_t ConcatSizeInBytes = BitsProvided / 4;
14000 uint64_t ByteShift = BitShift / 8;
14001
14002 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
14003 uint64_t BytesProvided = BitsProvided / 8;
14004 SDValue NextOp = Op.getOperand(i: NewIndex >= BytesProvided ? 0 : 1);
14005 NewIndex %= BytesProvided;
14006 return calculateByteProvider(Op: NextOp, Index: NewIndex, Depth: Depth + 1, StartingIndex);
14007 }
14008
14009 case ISD::SRA:
14010 case ISD::SRL: {
14011 if (IsVec)
14012 return std::nullopt;
14013
14014 auto *ShiftOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
14015 if (!ShiftOp)
14016 return std::nullopt;
14017
14018 uint64_t BitShift = ShiftOp->getZExtValue();
14019 if (BitShift % 8)
14020 return std::nullopt;
14021
14022 auto BitsProvided = Op.getScalarValueSizeInBits();
14023 if (BitsProvided % 8 != 0)
14024 return std::nullopt;
14025
14026 uint64_t BytesProvided = BitsProvided / 8;
14027 uint64_t ByteShift = BitShift / 8;
14028 // The dest of the shift will have good [0 : (BytesProvided - ByteShift)] bytes.
14029 // If the byte we are trying to provide (as tracked by Index) falls in this
14030 // range, then the SRL provides the byte. The byte of interest in the src of
14031 // the SRL is Index + ByteShift.
14032 return BytesProvided - ByteShift > Index
14033 ? calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte: StartingIndex,
14034 SrcIndex: Index + ByteShift)
14035 : ByteProvider<SDValue>::getConstantZero();
14036 }
14037
14038 case ISD::SHL: {
14039 if (IsVec)
14040 return std::nullopt;
14041
14042 auto *ShiftOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
14043 if (!ShiftOp)
14044 return std::nullopt;
14045
14046 uint64_t BitShift = ShiftOp->getZExtValue();
14047 if (BitShift % 8 != 0)
14048 return std::nullopt;
14049 uint64_t ByteShift = BitShift / 8;
14050
14051 // If we are shifting by an amount greater than (or equal to)
14052 // the index we are trying to provide, then it provides 0s. If not,
14053 // then the byte is not definitively 0, and the corresponding byte
14054 // of interest is Index - ByteShift of the src.
14055 return Index < ByteShift
14056 ? ByteProvider<SDValue>::getConstantZero()
14057 : calculateByteProvider(Op: Op.getOperand(i: 0), Index: Index - ByteShift,
14058 Depth: Depth + 1, StartingIndex);
14059 }
14060 case ISD::ANY_EXTEND:
14061 case ISD::SIGN_EXTEND:
14062 case ISD::ZERO_EXTEND:
14063 case ISD::SIGN_EXTEND_INREG:
14064 case ISD::AssertZext:
14065 case ISD::AssertSext: {
14066 if (IsVec)
14067 return std::nullopt;
14068
14069 SDValue NarrowOp = Op->getOperand(Num: 0);
14070 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
14071 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
14072 Op->getOpcode() == ISD::AssertZext ||
14073 Op->getOpcode() == ISD::AssertSext) {
14074 auto *VTSign = cast<VTSDNode>(Val: Op->getOperand(Num: 1));
14075 NarrowBitWidth = VTSign->getVT().getSizeInBits();
14076 }
14077 if (NarrowBitWidth % 8 != 0)
14078 return std::nullopt;
14079 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
14080
14081 if (Index >= NarrowByteWidth)
14082 return Op.getOpcode() == ISD::ZERO_EXTEND
14083 ? std::optional<ByteProvider<SDValue>>(
14084 ByteProvider<SDValue>::getConstantZero())
14085 : std::nullopt;
14086 return calculateByteProvider(Op: NarrowOp, Index, Depth: Depth + 1, StartingIndex);
14087 }
14088
14089 case ISD::TRUNCATE: {
14090 if (IsVec)
14091 return std::nullopt;
14092
14093 uint64_t NarrowByteWidth = BitWidth / 8;
14094
14095 if (NarrowByteWidth >= Index) {
14096 return calculateByteProvider(Op: Op.getOperand(i: 0), Index, Depth: Depth + 1,
14097 StartingIndex);
14098 }
14099
14100 return std::nullopt;
14101 }
14102
14103 case ISD::CopyFromReg: {
14104 if (BitWidth / 8 > Index)
14105 return calculateSrcByte(Op, DestByte: StartingIndex, SrcIndex: Index);
14106
14107 return std::nullopt;
14108 }
14109
14110 case ISD::LOAD: {
14111 auto *L = cast<LoadSDNode>(Val: Op.getNode());
14112
14113 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
14114 if (NarrowBitWidth % 8 != 0)
14115 return std::nullopt;
14116 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
14117
14118 // If the width of the load does not reach the byte we are trying to provide
14119 // for and it is not a ZEXTLOAD, then the load does not provide for the byte
14120 // in question.
14121 if (Index >= NarrowByteWidth) {
14122 return L->getExtensionType() == ISD::ZEXTLOAD
14123 ? std::optional<ByteProvider<SDValue>>(
14124 ByteProvider<SDValue>::getConstantZero())
14125 : std::nullopt;
14126 }
14127
14128 if (NarrowByteWidth > Index) {
14129 return calculateSrcByte(Op, DestByte: StartingIndex, SrcIndex: Index);
14130 }
14131
14132 return std::nullopt;
14133 }
14134
14135 case ISD::BSWAP: {
14136 if (IsVec)
14137 return std::nullopt;
14138
14139 return calculateByteProvider(Op: Op->getOperand(Num: 0), Index: BitWidth / 8 - Index - 1,
14140 Depth: Depth + 1, StartingIndex);
14141 }
14142
14143 case ISD::EXTRACT_VECTOR_ELT: {
14144 auto *IdxOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
14145 if (!IdxOp)
14146 return std::nullopt;
14147 auto VecIdx = IdxOp->getZExtValue();
14148 auto ScalarSize = Op.getScalarValueSizeInBits();
14149 if (ScalarSize < 32)
14150 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
14151 return calculateSrcByte(Op: ScalarSize >= 32 ? Op : Op.getOperand(i: 0),
14152 DestByte: StartingIndex, SrcIndex: Index);
14153 }
14154
14155 case AMDGPUISD::PERM: {
14156 if (IsVec)
14157 return std::nullopt;
14158
14159 auto *PermMask = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 2));
14160 if (!PermMask)
14161 return std::nullopt;
14162
14163 auto IdxMask =
14164 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
14165 if (IdxMask > 0x07 && IdxMask != 0x0c)
14166 return std::nullopt;
14167
14168 auto NextOp = Op.getOperand(i: IdxMask > 0x03 ? 0 : 1);
14169 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
14170
14171 return IdxMask != 0x0c ? calculateSrcByte(Op: NextOp, DestByte: StartingIndex, SrcIndex: NextIndex)
14172 : ByteProvider<SDValue>(
14173 ByteProvider<SDValue>::getConstantZero());
14174 }
14175
14176 default: {
14177 return std::nullopt;
14178 }
14179 }
14180
14181 llvm_unreachable("fully handled switch");
14182}
14183
14184// Returns true if the Operand is a scalar extended (or extend-loaded) from 16 bits
14185static bool isExtendedFrom16Bits(SDValue &Operand) {
14186
14187 switch (Operand.getOpcode()) {
14188 case ISD::ANY_EXTEND:
14189 case ISD::SIGN_EXTEND:
14190 case ISD::ZERO_EXTEND: {
14191 auto OpVT = Operand.getOperand(i: 0).getValueType();
14192 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
14193 }
14194 case ISD::LOAD: {
14195 LoadSDNode *L = cast<LoadSDNode>(Val: Operand.getNode());
14196 auto ExtType = L->getExtensionType();
14197 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
14198 ExtType == ISD::EXTLOAD) {
14199 auto MemVT = L->getMemoryVT();
14200 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
14201 }
14202 return L->getMemoryVT().getSizeInBits() == 16;
14203 }
14204 default:
14205 return false;
14206 }
14207}
14208
14209// Returns true if the mask selects consecutive bytes, and the first byte
14210// begins at an even byte offset from the 0th byte.
14211static bool addresses16Bits(int Mask) {
14212 int Low8 = Mask & 0xff;
14213 int Hi8 = (Mask & 0xff00) >> 8;
14214
14215 assert(Low8 < 8 && Hi8 < 8);
14216 // Are the bytes contiguous in the order of increasing addresses.
14217 bool IsConsecutive = (Hi8 - Low8 == 1);
14218 // Is the first byte at a location that is aligned for 16 bit instructions?
14219 // A counterexample is taking 2 consecutive bytes starting at the 8th bit.
14220 // In this case, we still need code to extract the 16 bit operand, so it
14221 // is better to use the i8 v_perm.
14222 bool Is16Aligned = !(Low8 % 2);
14223
14224 return IsConsecutive && Is16Aligned;
14225}
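
// Worked examples (for illustration):
//   addresses16Bits(0x0302) == true   (bytes 2 and 3: consecutive, even start)
//   addresses16Bits(0x0201) == false  (consecutive, but starts at an odd byte)
//   addresses16Bits(0x0300) == false  (bytes 0 and 3 are not consecutive)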
14226
14227// Do not lower into v_perm if the operands are actually 16 bit
14228// and the selected bits (based on PermMask) correspond with two
14229// easily addressable 16 bit operands.
14230static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
14231 SDValue &OtherOp) {
14232 int Low16 = PermMask & 0xffff;
14233 int Hi16 = (PermMask & 0xffff0000) >> 16;
14234
14235 auto TempOp = peekThroughBitcasts(V: Op);
14236 auto TempOtherOp = peekThroughBitcasts(V: OtherOp);
14237
14238 auto OpIs16Bit =
14239 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(Operand&: TempOp);
14240 if (!OpIs16Bit)
14241 return true;
14242
14243 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
14244 isExtendedFrom16Bits(Operand&: TempOtherOp);
14245 if (!OtherOpIs16Bit)
14246 return true;
14247
14248 // Do we cleanly address both
14249 return !addresses16Bits(Mask: Low16) || !addresses16Bits(Mask: Hi16);
14250}
14251
14252static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
14253 unsigned DWordOffset) {
14254 SDValue Ret;
14255
14256 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
14257 // ByteProvider must be at least 8 bits
14258 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
14259
14260 if (TypeSize <= 32)
14261 return DAG.getBitcastedAnyExtOrTrunc(Op: Src, DL: SL, VT: MVT::i32);
14262
14263 if (Src.getValueType().isVector()) {
14264 auto ScalarTySize = Src.getScalarValueSizeInBits();
14265 auto ScalarTy = Src.getValueType().getScalarType();
14266 if (ScalarTySize == 32) {
14267 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Src,
14268 N2: DAG.getConstant(Val: DWordOffset, DL: SL, VT: MVT::i32));
14269 }
14270 if (ScalarTySize > 32) {
14271 Ret = DAG.getNode(
14272 Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ScalarTy, N1: Src,
14273 N2: DAG.getConstant(Val: DWordOffset / (ScalarTySize / 32), DL: SL, VT: MVT::i32));
14274 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
14275 if (ShiftVal)
14276 Ret = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: Ret.getValueType(), N1: Ret,
14277 N2: DAG.getConstant(Val: ShiftVal, DL: SL, VT: MVT::i32));
14278 return DAG.getBitcastedAnyExtOrTrunc(Op: Ret, DL: SL, VT: MVT::i32);
14279 }
14280
14281 assert(ScalarTySize < 32);
14282 auto NumElements = TypeSize / ScalarTySize;
14283 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
14284 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
14285 auto NumElementsIn32 = 32 / ScalarTySize;
14286 auto NumAvailElements = DWordOffset < Trunc32Elements
14287 ? NumElementsIn32
14288 : NumElements - NormalizedTrunc;
14289
14290 SmallVector<SDValue, 4> VecSrcs;
14291 DAG.ExtractVectorElements(Op: Src, Args&: VecSrcs, Start: DWordOffset * NumElementsIn32,
14292 Count: NumAvailElements);
14293
14294 Ret = DAG.getBuildVector(
14295 VT: MVT::getVectorVT(VT: MVT::getIntegerVT(BitWidth: ScalarTySize), NumElements: NumAvailElements), DL: SL,
14296 Ops: VecSrcs);
14297 return Ret = DAG.getBitcastedAnyExtOrTrunc(Op: Ret, DL: SL, VT: MVT::i32);
14298 }
14299
14300 // Scalar type.
14301 auto ShiftVal = 32 * DWordOffset;
14302 Ret = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: Src.getValueType(), N1: Src,
14303 N2: DAG.getConstant(Val: ShiftVal, DL: SL, VT: MVT::i32));
14304 return DAG.getBitcastedAnyExtOrTrunc(Op: Ret, DL: SL, VT: MVT::i32);
14305}
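
// For illustration, a sketch of getDWordFromOffset on a few input shapes:
//   i64 src,   DWordOffset 1 -> the high 32 bits of src (srl by 32, truncated)
//   v4i16 src, DWordOffset 1 -> build_vector of elements 2..3, bitcast to i32
//   v2i32 src, DWordOffset 0 -> (extract_vector_elt src, 0)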
14306
14307static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
14308 SelectionDAG &DAG = DCI.DAG;
14309 [[maybe_unused]] EVT VT = N->getValueType(ResNo: 0);
14310 SmallVector<ByteProvider<SDValue>, 8> PermNodes;
14311
14312 // VT is known to be MVT::i32, so we need to provide 4 bytes.
14313 assert(VT == MVT::i32);
14314 for (int i = 0; i < 4; i++) {
14315 // Find the ByteProvider that provides the ith byte of the result of OR
14316 std::optional<ByteProvider<SDValue>> P =
14317 calculateByteProvider(Op: SDValue(N, 0), Index: i, Depth: 0, /*StartingIndex = */ i);
14318 // TODO support constantZero
14319 if (!P || P->isConstantZero())
14320 return SDValue();
14321
14322 PermNodes.push_back(Elt: *P);
14323 }
14324 if (PermNodes.size() != 4)
14325 return SDValue();
14326
14327 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
14328 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
14329 uint64_t PermMask = 0x00000000;
14330 for (size_t i = 0; i < PermNodes.size(); i++) {
14331 auto PermOp = PermNodes[i];
14332 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
14333 // by sizeof(Src2) = 4
14334 int SrcByteAdjust = 4;
14335
14336 // If the Src uses a byte from a different DWORD, then it corresponds
14337 // with a different source.
14338 if (!PermOp.hasSameSrc(Other: PermNodes[FirstSrc.first]) ||
14339 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
14340 if (SecondSrc)
14341 if (!PermOp.hasSameSrc(Other: PermNodes[SecondSrc->first]) ||
14342 ((PermOp.SrcOffset / 4) != SecondSrc->second))
14343 return SDValue();
14344
14345 // Set the index of the second distinct Src node
14346 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
14347 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
14348 SrcByteAdjust = 0;
14349 }
14350 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
14351 assert(!DAG.getDataLayout().isBigEndian());
14352 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
14353 }
14354 SDLoc DL(N);
14355 SDValue Op = *PermNodes[FirstSrc.first].Src;
14356 Op = getDWordFromOffset(DAG, SL: DL, Src: Op, DWordOffset: FirstSrc.second);
14357 assert(Op.getValueSizeInBits() == 32);
14358
14359 // Check that we are not just extracting the bytes in order from an op
14360 if (!SecondSrc) {
14361 int Low16 = PermMask & 0xffff;
14362 int Hi16 = (PermMask & 0xffff0000) >> 16;
14363
14364 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
14365 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
14366
14367 // The perm op would really just produce Op. So combine into Op
14368 if (WellFormedLow && WellFormedHi)
14369 return DAG.getBitcast(VT: MVT::getIntegerVT(BitWidth: 32), V: Op);
14370 }
14371
14372 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
14373
14374 if (SecondSrc) {
14375 OtherOp = getDWordFromOffset(DAG, SL: DL, Src: OtherOp, DWordOffset: SecondSrc->second);
14376 assert(OtherOp.getValueSizeInBits() == 32);
14377 }
14378
14379 // Check that we haven't just recreated the same FSHR node.
14380 if (N->getOpcode() == ISD::FSHR &&
14381 (N->getOperand(Num: 0) == Op || N->getOperand(Num: 0) == OtherOp) &&
14382 (N->getOperand(Num: 1) == Op || N->getOperand(Num: 1) == OtherOp))
14383 return SDValue();
14384
14385 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
14386
14387 assert(Op.getValueType().isByteSized() &&
14388 OtherOp.getValueType().isByteSized());
14389
14390 // If the ultimate src is less than 32 bits, then we will only be
14391 // using bytes 0 to (size of Op in bytes) - 1 in the or.
14392 // calculateByteProvider would not have returned Op as a source if we
14393 // used a byte that is outside its ValueType. Thus, we are free to
14394 // ANY_EXTEND as the extended bits are don't-cares.
14395 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, VT: MVT::i32);
14396 OtherOp = DAG.getBitcastedAnyExtOrTrunc(Op: OtherOp, DL, VT: MVT::i32);
14397
14398 return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: Op, N2: OtherOp,
14399 N3: DAG.getConstant(Val: PermMask, DL, VT: MVT::i32));
14400 }
14401 return SDValue();
14402}
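
// For illustration, a hypothetical PermMask built by matchPERM: if bytes 0 and
// 1 of the OR come from bytes 2 and 3 of the first source dword (first-source
// selectors are offset by 4) and bytes 2 and 3 come from bytes 0 and 1 of a
// second source dword, the result is
//   (perm FirstSrc, SecondSrc, 0x01000706)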
14403
14404SDValue SITargetLowering::performOrCombine(SDNode *N,
14405 DAGCombinerInfo &DCI) const {
14406 SelectionDAG &DAG = DCI.DAG;
14407 SDValue LHS = N->getOperand(Num: 0);
14408 SDValue RHS = N->getOperand(Num: 1);
14409
14410 EVT VT = N->getValueType(ResNo: 0);
14411 if (VT == MVT::i1) {
14412 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
14413 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
14414 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
14415 SDValue Src = LHS.getOperand(i: 0);
14416 if (Src != RHS.getOperand(i: 0))
14417 return SDValue();
14418
14419 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(Val: LHS.getOperand(i: 1));
14420 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val: RHS.getOperand(i: 1));
14421 if (!CLHS || !CRHS)
14422 return SDValue();
14423
14424 // Only 10 bits are used.
14425 static const uint32_t MaxMask = 0x3ff;
14426
14427 uint32_t NewMask =
14428 (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
14429 SDLoc DL(N);
14430 return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL, VT: MVT::i1, N1: Src,
14431 N2: DAG.getConstant(Val: NewMask, DL, VT: MVT::i32));
14432 }
14433
14434 return SDValue();
14435 }
14436
14437 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
14438 if (isa<ConstantSDNode>(Val: RHS) && LHS.hasOneUse() &&
14439 LHS.getOpcode() == AMDGPUISD::PERM &&
14440 isa<ConstantSDNode>(Val: LHS.getOperand(i: 2))) {
14441 uint32_t Sel = getConstantPermuteMask(C: N->getConstantOperandVal(Num: 1));
14442 if (!Sel)
14443 return SDValue();
14444
14445 Sel |= LHS.getConstantOperandVal(i: 2);
14446 SDLoc DL(N);
14447 return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: LHS.getOperand(i: 0),
14448 N2: LHS.getOperand(i: 1), N3: DAG.getConstant(Val: Sel, DL, VT: MVT::i32));
14449 }
14450
14451 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
14452 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14453 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
14454 N->isDivergent() && TII->pseudoToMCOpcode(Opcode: AMDGPU::V_PERM_B32_e64) != -1) {
14455
14456 // If all the uses of an or need to extract the individual elements, do not
14457 // attempt to lower into v_perm
14458 auto usesCombinedOperand = [](SDNode *OrUse) {
14459 // If we have any non-vectorized use, then it is a candidate for v_perm
14460 if (OrUse->getOpcode() != ISD::BITCAST ||
14461 !OrUse->getValueType(ResNo: 0).isVector())
14462 return true;
14463
14464 // Likewise if the bitcast-to-vector has any non-vectorized use.
14465 for (auto *VUser : OrUse->users()) {
14466 if (!VUser->getValueType(ResNo: 0).isVector())
14467 return true;
14468
14469 // If the use of a vector is a store, then combining via a v_perm
14470 // is beneficial.
14471 // TODO -- whitelist more uses
14472 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
14473 if (VUser->getOpcode() == VectorwiseOp)
14474 return true;
14475 }
14476 return false;
14477 };
14478
14479 if (!any_of(Range: N->users(), P: usesCombinedOperand))
14480 return SDValue();
14481
14482 uint32_t LHSMask = getPermuteMask(V: LHS);
14483 uint32_t RHSMask = getPermuteMask(V: RHS);
14484
14485 if (LHSMask != ~0u && RHSMask != ~0u) {
14486 // Canonicalize the expression in an attempt to have fewer unique masks
14487 // and therefore fewer registers used to hold the masks.
14488 if (LHSMask > RHSMask) {
14489 std::swap(a&: LHSMask, b&: RHSMask);
14490 std::swap(a&: LHS, b&: RHS);
14491 }
14492
14493 // Select 0xc for each lane used from the source operand: a zero byte has
14494 // 0xc in the mask, a 0xff byte has 0xff, and actual lanes are in the 0-3 range.
14495 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14496 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14497
14498 // Check if we need to combine values from two sources within a byte.
14499 if (!(LHSUsedLanes & RHSUsedLanes) &&
14500 // If we select the high and low words, keep it for SDWA.
14501 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
14502 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
14503 // Kill zero bytes selected by other mask. Zero value is 0xc.
14504 LHSMask &= ~RHSUsedLanes;
14505 RHSMask &= ~LHSUsedLanes;
14506 // Add 4 to each active LHS lane
14507 LHSMask |= LHSUsedLanes & 0x04040404;
14508 // Combine masks
14509 uint32_t Sel = LHSMask | RHSMask;
14510 SDLoc DL(N);
14511
14512 return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: LHS.getOperand(i: 0),
14513 N2: RHS.getOperand(i: 0),
14514 N3: DAG.getConstant(Val: Sel, DL, VT: MVT::i32));
14515 }
14516 }
14517 if (LHSMask == ~0u || RHSMask == ~0u) {
14518 if (SDValue Perm = matchPERM(N, DCI))
14519 return Perm;
14520 }
14521 }
14522
14523 // Detect an identity v2i32 OR and replace it with the identity source node.
14524 // Specifically, an Or whose operands are constructed from the same source node
14525 // via extract_vector_elt and build_vector, i.e.
14526 // v2i32 or(
14527 // v2i32 build_vector(
14528 // i32 extract_elt(%IdentitySrc, 0),
14529 // i32 0
14530 // ),
14531 // v2i32 build_vector(
14532 // i32 0,
14533 // i32 extract_elt(%IdentitySrc, 1)
14534 // ) )
14535 // =>
14536 // v2i32 %IdentitySrc
14537
14538 if (VT == MVT::v2i32 && LHS->getOpcode() == ISD::BUILD_VECTOR &&
14539 RHS->getOpcode() == ISD::BUILD_VECTOR) {
14540
14541 ConstantSDNode *LC = dyn_cast<ConstantSDNode>(Val: LHS->getOperand(Num: 1));
14542 ConstantSDNode *RC = dyn_cast<ConstantSDNode>(Val: RHS->getOperand(Num: 0));
14543
14544 // Check that the build_vectors have zeroes in the expected lanes.
14545 if (LC && RC && LC->getZExtValue() == 0 && RC->getZExtValue() == 0) {
14546
14547 // Get the extract_vector_element operands.
14548 SDValue LEVE = LHS->getOperand(Num: 0);
14549 SDValue REVE = RHS->getOperand(Num: 1);
14550
14551 if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
14552 REVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
14553 // Check that different elements from the same vector are
14554 // extracted.
14555 if (LEVE->getOperand(Num: 0) == REVE->getOperand(Num: 0) &&
14556 LEVE->getOperand(Num: 1) != REVE->getOperand(Num: 1)) {
14557 SDValue IdentitySrc = LEVE.getOperand(i: 0);
14558 return IdentitySrc;
14559 }
14560 }
14561 }
14562 }
14563
14564 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
14565 return SDValue();
14566
14567 // TODO: This could be a generic combine with a predicate for extracting the
14568 // high half of an integer being free.
14569
14570 // (or i64:x, (zero_extend i32:y)) ->
14571 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
14572 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
14573 RHS.getOpcode() != ISD::ZERO_EXTEND)
14574 std::swap(a&: LHS, b&: RHS);
14575
14576 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
14577 SDValue ExtSrc = RHS.getOperand(i: 0);
14578 EVT SrcVT = ExtSrc.getValueType();
14579 if (SrcVT == MVT::i32) {
14580 SDLoc SL(N);
14581 auto [LowLHS, HiBits] = split64BitValue(Op: LHS, DAG);
14582 SDValue LowOr = DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: LowLHS, N2: ExtSrc);
14583
14584 DCI.AddToWorklist(N: LowOr.getNode());
14585 DCI.AddToWorklist(N: HiBits.getNode());
14586
14587 SDValue Vec =
14588 DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32, N1: LowOr, N2: HiBits);
14589 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: Vec);
14590 }
14591 }
14592
14593 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
14594 if (CRHS) {
14595 if (SDValue Split = splitBinaryBitConstantOp(DCI, SL: SDLoc(N), Opc: ISD::OR,
14596 LHS: N->getOperand(Num: 0), CRHS))
14597 return Split;
14598 }
14599
14600 return SDValue();
14601}
14602
14603SDValue SITargetLowering::performXorCombine(SDNode *N,
14604 DAGCombinerInfo &DCI) const {
14605 if (SDValue RV = reassociateScalarOps(N, DAG&: DCI.DAG))
14606 return RV;
14607
14608 SDValue LHS = N->getOperand(Num: 0);
14609 SDValue RHS = N->getOperand(Num: 1);
14610
14611 const ConstantSDNode *CRHS = isConstOrConstSplat(N: RHS);
14612 SelectionDAG &DAG = DCI.DAG;
14613
14614 EVT VT = N->getValueType(ResNo: 0);
14615 if (CRHS && VT == MVT::i64) {
14616 if (SDValue Split =
14617 splitBinaryBitConstantOp(DCI, SL: SDLoc(N), Opc: ISD::XOR, LHS, CRHS))
14618 return Split;
14619 }
14620
14621 // v2i32 (xor (vselect cc, x, y), K) ->
14622 // (v2i32 vselect cc, (xor x, K), (xor y, K)). This enables the xor to be
14623 // replaced with source modifiers when the select is lowered to CNDMASK.
14624 unsigned Opc = LHS.getOpcode();
14625 if (((Opc == ISD::VSELECT && VT == MVT::v2i32) ||
14626 (Opc == ISD::SELECT && VT == MVT::i64)) &&
14627 CRHS && CRHS->getAPIntValue().isSignMask()) {
14628 SDValue CC = LHS->getOperand(Num: 0);
14629 SDValue TRUE = LHS->getOperand(Num: 1);
14630 SDValue FALSE = LHS->getOperand(Num: 2);
14631 SDValue XTrue = DAG.getNode(Opcode: ISD::XOR, DL: SDLoc(N), VT, N1: TRUE, N2: RHS);
14632 SDValue XFalse = DAG.getNode(Opcode: ISD::XOR, DL: SDLoc(N), VT, N1: FALSE, N2: RHS);
14633 SDValue XSelect =
14634 DAG.getNode(Opcode: ISD::VSELECT, DL: SDLoc(N), VT, N1: CC, N2: XTrue, N3: XFalse);
14635 return XSelect;
14636 }
14637
14638 // Make sure to apply the 64-bit constant splitting fold before trying to fold
14639 // fneg-like xors into 64-bit select.
14640 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
14641 // This looks like an fneg, try to fold as a source modifier.
14642 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
14643 shouldFoldFNegIntoSrc(FNeg: N, FNegSrc: LHS)) {
14644 // xor (select c, a, b), 0x80000000 ->
14645 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
14646 SDLoc DL(N);
14647 SDValue CastLHS =
14648 DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f32, Operand: LHS->getOperand(Num: 1));
14649 SDValue CastRHS =
14650 DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f32, Operand: LHS->getOperand(Num: 2));
14651 SDValue FNegLHS = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f32, Operand: CastLHS);
14652 SDValue FNegRHS = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f32, Operand: CastRHS);
14653 SDValue NewSelect = DAG.getNode(Opcode: ISD::SELECT, DL, VT: MVT::f32,
14654 N1: LHS->getOperand(Num: 0), N2: FNegLHS, N3: FNegRHS);
14655 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: NewSelect);
14656 }
14657 }
14658
14659 return SDValue();
14660}
14661
14662SDValue
14663SITargetLowering::performZeroOrAnyExtendCombine(SDNode *N,
14664 DAGCombinerInfo &DCI) const {
14665 if (!Subtarget->has16BitInsts() ||
14666 DCI.getDAGCombineLevel() < AfterLegalizeTypes)
14667 return SDValue();
14668
14669 EVT VT = N->getValueType(ResNo: 0);
14670 if (VT != MVT::i32)
14671 return SDValue();
14672
14673 SDValue Src = N->getOperand(Num: 0);
14674 if (Src.getValueType() != MVT::i16)
14675 return SDValue();
14676
14677 if (!Src->hasOneUse())
14678 return SDValue();
14679
14680 // TODO: We bail out below if SrcOffset is not in the first dword (>= 4). It's
14681 // possible we're missing out on some combine opportunities, but we'd need to
14682 // weigh the cost of extracting the byte from the upper dwords.
14683
14684 std::optional<ByteProvider<SDValue>> BP0 =
14685 calculateByteProvider(Op: SDValue(N, 0), Index: 0, Depth: 0, StartingIndex: 0);
14686 if (!BP0 || BP0->SrcOffset >= 4 || !BP0->Src)
14687 return SDValue();
14688 SDValue V0 = *BP0->Src;
14689
14690 std::optional<ByteProvider<SDValue>> BP1 =
14691 calculateByteProvider(Op: SDValue(N, 0), Index: 1, Depth: 0, StartingIndex: 1);
14692 if (!BP1 || BP1->SrcOffset >= 4 || !BP1->Src)
14693 return SDValue();
14694
14695 SDValue V1 = *BP1->Src;
14696
14697 if (V0 == V1)
14698 return SDValue();
14699
14700 SelectionDAG &DAG = DCI.DAG;
14701 SDLoc DL(N);
14702 uint32_t PermMask = 0x0c0c0c0c;
14703 if (V0) {
14704 V0 = DAG.getBitcastedAnyExtOrTrunc(Op: V0, DL, VT: MVT::i32);
14705 PermMask = (PermMask & ~0xFF) | (BP0->SrcOffset + 4);
14706 }
14707
14708 if (V1) {
14709 V1 = DAG.getBitcastedAnyExtOrTrunc(Op: V1, DL, VT: MVT::i32);
14710 PermMask = (PermMask & ~(0xFF << 8)) | (BP1->SrcOffset << 8);
14711 }
14712
14713 return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: V0, N2: V1,
14714 N3: DAG.getConstant(Val: PermMask, DL, VT: MVT::i32));
14715}
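
// For illustration, a sketch of the combine above: the two low bytes of the
// zero-extended value must come from two different dwords V0 and V1. If byte 0
// is byte 1 of V0 and byte 1 is byte 1 of V1, the result is
//   (perm V0, V1, 0x0c0c0105)
// where the upper 0x0c selectors produce the zeroed high bytes. If both bytes
// trace back to the same dword (V0 == V1), the combine bails out.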
14716
14717SDValue
14718SITargetLowering::performSignExtendInRegCombine(SDNode *N,
14719 DAGCombinerInfo &DCI) const {
14720 SDValue Src = N->getOperand(Num: 0);
14721 auto *VTSign = cast<VTSDNode>(Val: N->getOperand(Num: 1));
14722
14723 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
14724 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
14725 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
14726 VTSign->getVT() == MVT::i8) ||
14727 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
14728 VTSign->getVT() == MVT::i16))) {
14729 assert(Subtarget->hasScalarSubwordLoads() &&
14730 "s_buffer_load_{u8, i8} are supported "
14731 "in GFX12 (or newer) architectures.");
14732 EVT VT = Src.getValueType();
14733 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
14734 ? AMDGPUISD::SBUFFER_LOAD_BYTE
14735 : AMDGPUISD::SBUFFER_LOAD_SHORT;
14736 SDLoc DL(N);
14737 SDVTList ResList = DCI.DAG.getVTList(VT: MVT::i32);
14738 SDValue Ops[] = {
14739 Src.getOperand(i: 0), // source register
14740 Src.getOperand(i: 1), // offset
14741 Src.getOperand(i: 2) // cachePolicy
14742 };
14743 auto *M = cast<MemSDNode>(Val&: Src);
14744 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
14745 Opcode: Opc, dl: DL, VTList: ResList, Ops, MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
14746 SDValue LoadVal = DCI.DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: BufferLoad);
14747 return LoadVal;
14748 }
14749 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
14750 VTSign->getVT() == MVT::i8) ||
14751 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
14752 VTSign->getVT() == MVT::i16)) &&
14753 Src.hasOneUse()) {
14754 auto *M = cast<MemSDNode>(Val&: Src);
14755 SDValue Ops[] = {Src.getOperand(i: 0), // Chain
14756 Src.getOperand(i: 1), // rsrc
14757 Src.getOperand(i: 2), // vindex
14758 Src.getOperand(i: 3), // voffset
14759 Src.getOperand(i: 4), // soffset
14760 Src.getOperand(i: 5), // offset
14761 Src.getOperand(i: 6), Src.getOperand(i: 7)};
14762 // replace with BUFFER_LOAD_BYTE/SHORT
14763 SDVTList ResList =
14764 DCI.DAG.getVTList(VT1: MVT::i32, VT2: Src.getOperand(i: 0).getValueType());
14765 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
14766 ? AMDGPUISD::BUFFER_LOAD_BYTE
14767 : AMDGPUISD::BUFFER_LOAD_SHORT;
14768 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
14769 Opcode: Opc, dl: SDLoc(N), VTList: ResList, Ops, MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
14770 return DCI.DAG.getMergeValues(
14771 Ops: {BufferLoadSignExt, BufferLoadSignExt.getValue(R: 1)}, dl: SDLoc(N));
14772 }
14773 return SDValue();
14774}
14775
14776SDValue SITargetLowering::performClassCombine(SDNode *N,
14777 DAGCombinerInfo &DCI) const {
14778 SelectionDAG &DAG = DCI.DAG;
14779 SDValue Mask = N->getOperand(Num: 1);
14780
14781 // fp_class x, 0 -> false
14782 if (isNullConstant(V: Mask))
14783 return DAG.getConstant(Val: 0, DL: SDLoc(N), VT: MVT::i1);
14784
14785 if (N->getOperand(Num: 0).isUndef())
14786 return DAG.getUNDEF(VT: MVT::i1);
14787
14788 return SDValue();
14789}
14790
14791SDValue SITargetLowering::performRcpCombine(SDNode *N,
14792 DAGCombinerInfo &DCI) const {
14793 EVT VT = N->getValueType(ResNo: 0);
14794 SDValue N0 = N->getOperand(Num: 0);
14795
14796 if (N0.isUndef()) {
14797 return DCI.DAG.getConstantFP(Val: APFloat::getQNaN(Sem: VT.getFltSemantics()),
14798 DL: SDLoc(N), VT);
14799 }
14800
14801 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
14802 N0.getOpcode() == ISD::SINT_TO_FP)) {
14803 return DCI.DAG.getNode(Opcode: AMDGPUISD::RCP_IFLAG, DL: SDLoc(N), VT, Operand: N0,
14804 Flags: N->getFlags());
14805 }
14806
14807 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
14808 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
14809 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
14810 return DCI.DAG.getNode(Opcode: AMDGPUISD::RSQ, DL: SDLoc(N), VT, Operand: N0.getOperand(i: 0),
14811 Flags: N->getFlags());
14812 }
14813
14814 return AMDGPUTargetLowering::performRcpCombine(N, DCI);
14815}
14816
14817bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
14818 unsigned MaxDepth) const {
14819 unsigned Opcode = Op.getOpcode();
14820 if (Opcode == ISD::FCANONICALIZE)
14821 return true;
14822
14823 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Val&: Op)) {
14824 const auto &F = CFP->getValueAPF();
14825 if (F.isNaN() && F.isSignaling())
14826 return false;
14827 if (!F.isDenormal())
14828 return true;
14829
14830 DenormalMode Mode =
14831 DAG.getMachineFunction().getDenormalMode(FPType: F.getSemantics());
14832 return Mode == DenormalMode::getIEEE();
14833 }
14834
14835 // If source is a result of another standard FP operation it is already in
14836 // canonical form.
14837 if (MaxDepth == 0)
14838 return false;
14839
14840 switch (Opcode) {
14841 // These will flush denorms if required.
14842 case ISD::FADD:
14843 case ISD::FSUB:
14844 case ISD::FMUL:
14845 case ISD::FCEIL:
14846 case ISD::FFLOOR:
14847 case ISD::FMA:
14848 case ISD::FMAD:
14849 case ISD::FSQRT:
14850 case ISD::FDIV:
14851 case ISD::FREM:
14852 case ISD::FP_ROUND:
14853 case ISD::FP_EXTEND:
14854 case ISD::FP16_TO_FP:
14855 case ISD::FP_TO_FP16:
14856 case ISD::BF16_TO_FP:
14857 case ISD::FP_TO_BF16:
14858 case ISD::FLDEXP:
14859 case AMDGPUISD::FMUL_LEGACY:
14860 case AMDGPUISD::FMAD_FTZ:
14861 case AMDGPUISD::RCP:
14862 case AMDGPUISD::RSQ:
14863 case AMDGPUISD::RSQ_CLAMP:
14864 case AMDGPUISD::RCP_LEGACY:
14865 case AMDGPUISD::RCP_IFLAG:
14866 case AMDGPUISD::LOG:
14867 case AMDGPUISD::EXP:
14868 case AMDGPUISD::DIV_SCALE:
14869 case AMDGPUISD::DIV_FMAS:
14870 case AMDGPUISD::DIV_FIXUP:
14871 case AMDGPUISD::FRACT:
14872 case AMDGPUISD::CVT_PKRTZ_F16_F32:
14873 case AMDGPUISD::CVT_F32_UBYTE0:
14874 case AMDGPUISD::CVT_F32_UBYTE1:
14875 case AMDGPUISD::CVT_F32_UBYTE2:
14876 case AMDGPUISD::CVT_F32_UBYTE3:
14877 case AMDGPUISD::FP_TO_FP16:
14878 case AMDGPUISD::SIN_HW:
14879 case AMDGPUISD::COS_HW:
14880 return true;
14881
14882 // These can/will be lowered to or combined as bit operations, so we need to
14883 // check their inputs recursively.
14884 case ISD::FNEG:
14885 case ISD::FABS:
14886 case ISD::FCOPYSIGN:
14887 return isCanonicalized(DAG, Op: Op.getOperand(i: 0), MaxDepth: MaxDepth - 1);
14888
14889 case ISD::AND:
14890 if (Op.getValueType() == MVT::i32) {
14891 // Be careful: we only know this is a bitcast of some floating-point type;
14892 // it could be f32 or v2f16, and we have no way of knowing which. Luckily
14893 // the constant value that we optimize for, which comes up in fp32-to-bf16
14894 // conversions, is valid to optimize for all of these types.
14895 if (auto *RHS = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 1))) {
14896 if (RHS->getZExtValue() == 0xffff0000) {
14897 return isCanonicalized(DAG, Op: Op.getOperand(i: 0), MaxDepth: MaxDepth - 1);
14898 }
14899 }
14900 }
14901 break;
14902
14903 case ISD::FSIN:
14904 case ISD::FCOS:
14905 case ISD::FSINCOS:
14906 return Op.getValueType().getScalarType() != MVT::f16;
14907
14908 case ISD::FMINNUM:
14909 case ISD::FMAXNUM:
14910 case ISD::FMINNUM_IEEE:
14911 case ISD::FMAXNUM_IEEE:
14912 case ISD::FMINIMUM:
14913 case ISD::FMAXIMUM:
14914 case ISD::FMINIMUMNUM:
14915 case ISD::FMAXIMUMNUM:
14916 case AMDGPUISD::CLAMP:
14917 case AMDGPUISD::FMED3:
14918 case AMDGPUISD::FMAX3:
14919 case AMDGPUISD::FMIN3:
14920 case AMDGPUISD::FMAXIMUM3:
14921 case AMDGPUISD::FMINIMUM3: {
14922 // FIXME: Shouldn't treat the generic operations differently based on these.
14923 // However, we aren't really required to flush the result from
14924 // minnum/maxnum.
14925
14926 // snans will be quieted, so we only need to worry about denormals.
14927 if (Subtarget->supportsMinMaxDenormModes() ||
14928 // FIXME: denormalsEnabledForType is broken for dynamic
14929 denormalsEnabledForType(DAG, VT: Op.getValueType()))
14930 return true;
14931
14932 // Flushing may be required.
14933 // On pre-GFX9 targets, V_MIN_F32 and friends do not flush denorms, so for
14934 // such targets we need to check their inputs recursively.
14935
14936 // FIXME: Does this apply with clamp? It's implemented with max.
14937 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
14938 if (!isCanonicalized(DAG, Op: Op.getOperand(i: I), MaxDepth: MaxDepth - 1))
14939 return false;
14940 }
14941
14942 return true;
14943 }
14944 case ISD::SELECT: {
14945 return isCanonicalized(DAG, Op: Op.getOperand(i: 1), MaxDepth: MaxDepth - 1) &&
14946 isCanonicalized(DAG, Op: Op.getOperand(i: 2), MaxDepth: MaxDepth - 1);
14947 }
14948 case ISD::BUILD_VECTOR: {
14949 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
14950 SDValue SrcOp = Op.getOperand(i);
14951 if (!isCanonicalized(DAG, Op: SrcOp, MaxDepth: MaxDepth - 1))
14952 return false;
14953 }
14954
14955 return true;
14956 }
14957 case ISD::EXTRACT_VECTOR_ELT:
14958 case ISD::EXTRACT_SUBVECTOR: {
14959 return isCanonicalized(DAG, Op: Op.getOperand(i: 0), MaxDepth: MaxDepth - 1);
14960 }
14961 case ISD::INSERT_VECTOR_ELT: {
14962 return isCanonicalized(DAG, Op: Op.getOperand(i: 0), MaxDepth: MaxDepth - 1) &&
14963 isCanonicalized(DAG, Op: Op.getOperand(i: 1), MaxDepth: MaxDepth - 1);
14964 }
14965 case ISD::UNDEF:
14966 // Could be anything.
14967 return false;
14968
14969 case ISD::BITCAST:
14970 // TODO: This is incorrect as it loses track of the operand's type. We may
14971 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
14972 // same bits that are canonicalized in one type need not be in the other.
14973 return isCanonicalized(DAG, Op: Op.getOperand(i: 0), MaxDepth: MaxDepth - 1);
14974 case ISD::TRUNCATE: {
14975 // Hack around the mess we make when legalizing extract_vector_elt.
14976 if (Op.getValueType() == MVT::i16) {
14977 SDValue TruncSrc = Op.getOperand(i: 0);
14978 if (TruncSrc.getValueType() == MVT::i32 &&
14979 TruncSrc.getOpcode() == ISD::BITCAST &&
14980 TruncSrc.getOperand(i: 0).getValueType() == MVT::v2f16) {
14981 return isCanonicalized(DAG, Op: TruncSrc.getOperand(i: 0), MaxDepth: MaxDepth - 1);
14982 }
14983 }
14984 return false;
14985 }
14986 case ISD::INTRINSIC_WO_CHAIN: {
14987 unsigned IntrinsicID = Op.getConstantOperandVal(i: 0);
14988 // TODO: Handle more intrinsics
14989 switch (IntrinsicID) {
14990 case Intrinsic::amdgcn_cvt_pkrtz:
14991 case Intrinsic::amdgcn_cubeid:
14992 case Intrinsic::amdgcn_frexp_mant:
14993 case Intrinsic::amdgcn_fdot2:
14994 case Intrinsic::amdgcn_rcp:
14995 case Intrinsic::amdgcn_rsq:
14996 case Intrinsic::amdgcn_rsq_clamp:
14997 case Intrinsic::amdgcn_rcp_legacy:
14998 case Intrinsic::amdgcn_rsq_legacy:
14999 case Intrinsic::amdgcn_trig_preop:
15000 case Intrinsic::amdgcn_tanh:
15001 case Intrinsic::amdgcn_log:
15002 case Intrinsic::amdgcn_exp2:
15003 case Intrinsic::amdgcn_sqrt:
15004 return true;
15005 default:
15006 break;
15007 }
15008
15009 break;
15010 }
15011 default:
15012 break;
15013 }
15014
15015 // FIXME: denormalsEnabledForType is broken for dynamic
15016 return denormalsEnabledForType(DAG, VT: Op.getValueType()) &&
15017 DAG.isKnownNeverSNaN(Op);
15018}
15019
15020bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
15021 unsigned MaxDepth) const {
15022 const MachineRegisterInfo &MRI = MF.getRegInfo();
15023 MachineInstr *MI = MRI.getVRegDef(Reg);
15024 unsigned Opcode = MI->getOpcode();
15025
15026 if (Opcode == AMDGPU::G_FCANONICALIZE)
15027 return true;
15028
15029 std::optional<FPValueAndVReg> FCR;
15030 // Constant splat (can be padded with undef) or scalar constant.
15031 if (mi_match(R: Reg, MRI, P: MIPatternMatch::m_GFCstOrSplat(FPValReg&: FCR))) {
15032 if (FCR->Value.isSignaling())
15033 return false;
15034 if (!FCR->Value.isDenormal())
15035 return true;
15036
15037 DenormalMode Mode = MF.getDenormalMode(FPType: FCR->Value.getSemantics());
15038 return Mode == DenormalMode::getIEEE();
15039 }
15040
15041 if (MaxDepth == 0)
15042 return false;
15043
15044 switch (Opcode) {
15045 case AMDGPU::G_FADD:
15046 case AMDGPU::G_FSUB:
15047 case AMDGPU::G_FMUL:
15048 case AMDGPU::G_FCEIL:
15049 case AMDGPU::G_FFLOOR:
15050 case AMDGPU::G_FRINT:
15051 case AMDGPU::G_FNEARBYINT:
15052 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
15053 case AMDGPU::G_INTRINSIC_TRUNC:
15054 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
15055 case AMDGPU::G_FMA:
15056 case AMDGPU::G_FMAD:
15057 case AMDGPU::G_FSQRT:
15058 case AMDGPU::G_FDIV:
15059 case AMDGPU::G_FREM:
15060 case AMDGPU::G_FPOW:
15061 case AMDGPU::G_FPEXT:
15062 case AMDGPU::G_FLOG:
15063 case AMDGPU::G_FLOG2:
15064 case AMDGPU::G_FLOG10:
15065 case AMDGPU::G_FPTRUNC:
15066 case AMDGPU::G_AMDGPU_RCP_IFLAG:
15067 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
15068 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
15069 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
15070 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
15071 return true;
15072 case AMDGPU::G_FNEG:
15073 case AMDGPU::G_FABS:
15074 case AMDGPU::G_FCOPYSIGN:
15075 return isCanonicalized(Reg: MI->getOperand(i: 1).getReg(), MF, MaxDepth: MaxDepth - 1);
15076 case AMDGPU::G_FMINNUM:
15077 case AMDGPU::G_FMAXNUM:
15078 case AMDGPU::G_FMINNUM_IEEE:
15079 case AMDGPU::G_FMAXNUM_IEEE:
15080 case AMDGPU::G_FMINIMUM:
15081 case AMDGPU::G_FMAXIMUM:
15082 case AMDGPU::G_FMINIMUMNUM:
15083 case AMDGPU::G_FMAXIMUMNUM: {
15084 if (Subtarget->supportsMinMaxDenormModes() ||
15085 // FIXME: denormalsEnabledForType is broken for dynamic
15086 denormalsEnabledForType(Ty: MRI.getType(Reg), MF))
15087 return true;
15088
15089 [[fallthrough]];
15090 }
15091 case AMDGPU::G_BUILD_VECTOR:
15092 for (const MachineOperand &MO : llvm::drop_begin(RangeOrContainer: MI->operands()))
15093 if (!isCanonicalized(Reg: MO.getReg(), MF, MaxDepth: MaxDepth - 1))
15094 return false;
15095 return true;
15096 case AMDGPU::G_INTRINSIC:
15097 case AMDGPU::G_INTRINSIC_CONVERGENT:
15098 switch (cast<GIntrinsic>(Val: MI)->getIntrinsicID()) {
15099 case Intrinsic::amdgcn_fmul_legacy:
15100 case Intrinsic::amdgcn_fmad_ftz:
15101 case Intrinsic::amdgcn_sqrt:
15102 case Intrinsic::amdgcn_fmed3:
15103 case Intrinsic::amdgcn_sin:
15104 case Intrinsic::amdgcn_cos:
15105 case Intrinsic::amdgcn_log:
15106 case Intrinsic::amdgcn_exp2:
15107 case Intrinsic::amdgcn_log_clamp:
15108 case Intrinsic::amdgcn_rcp:
15109 case Intrinsic::amdgcn_rcp_legacy:
15110 case Intrinsic::amdgcn_rsq:
15111 case Intrinsic::amdgcn_rsq_clamp:
15112 case Intrinsic::amdgcn_rsq_legacy:
15113 case Intrinsic::amdgcn_div_scale:
15114 case Intrinsic::amdgcn_div_fmas:
15115 case Intrinsic::amdgcn_div_fixup:
15116 case Intrinsic::amdgcn_fract:
15117 case Intrinsic::amdgcn_cvt_pkrtz:
15118 case Intrinsic::amdgcn_cubeid:
15119 case Intrinsic::amdgcn_cubema:
15120 case Intrinsic::amdgcn_cubesc:
15121 case Intrinsic::amdgcn_cubetc:
15122 case Intrinsic::amdgcn_frexp_mant:
15123 case Intrinsic::amdgcn_fdot2:
15124 case Intrinsic::amdgcn_trig_preop:
15125 case Intrinsic::amdgcn_tanh:
15126 return true;
15127 default:
15128 break;
15129 }
15130
15131 [[fallthrough]];
15132 default:
15133 return false;
15134 }
15135
15136 llvm_unreachable("invalid operation");
15137}
15138
15139// Constant fold canonicalize.
15140SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
15141 const SDLoc &SL, EVT VT,
15142 const APFloat &C) const {
15143 // Flush denormals to 0 if not enabled.
15144 if (C.isDenormal()) {
15145 DenormalMode Mode =
15146 DAG.getMachineFunction().getDenormalMode(FPType: C.getSemantics());
15147 if (Mode == DenormalMode::getPreserveSign()) {
15148 return DAG.getConstantFP(
15149 Val: APFloat::getZero(Sem: C.getSemantics(), Negative: C.isNegative()), DL: SL, VT);
15150 }
15151
15152 if (Mode != DenormalMode::getIEEE())
15153 return SDValue();
15154 }
15155
15156 if (C.isNaN()) {
15157 APFloat CanonicalQNaN = APFloat::getQNaN(Sem: C.getSemantics());
15158 if (C.isSignaling()) {
15159 // Quiet a signaling NaN.
15160 // FIXME: Is this supposed to preserve payload bits?
15161 return DAG.getConstantFP(Val: CanonicalQNaN, DL: SL, VT);
15162 }
15163
15164 // Make sure it is the canonical NaN bitpattern.
15165 //
15166 // TODO: Can we use -1 as the canonical NaN value since it's an inline
15167 // immediate?
15168 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
15169 return DAG.getConstantFP(Val: CanonicalQNaN, DL: SL, VT);
15170 }
15171
15172 // Already canonical.
15173 return DAG.getConstantFP(Val: C, DL: SL, VT);
15174}
15175
15176static bool vectorEltWillFoldAway(SDValue Op) {
15177 return Op.isUndef() || isa<ConstantFPSDNode>(Val: Op);
15178}
15179
15180SDValue
15181SITargetLowering::performFCanonicalizeCombine(SDNode *N,
15182 DAGCombinerInfo &DCI) const {
15183 SelectionDAG &DAG = DCI.DAG;
15184 SDValue N0 = N->getOperand(Num: 0);
15185 EVT VT = N->getValueType(ResNo: 0);
15186
15187 // fcanonicalize undef -> qnan
15188 if (N0.isUndef()) {
15189 APFloat QNaN = APFloat::getQNaN(Sem: VT.getFltSemantics());
15190 return DAG.getConstantFP(Val: QNaN, DL: SDLoc(N), VT);
15191 }
15192
15193 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N: N0)) {
15194 EVT VT = N->getValueType(ResNo: 0);
15195 return getCanonicalConstantFP(DAG, SL: SDLoc(N), VT, C: CFP->getValueAPF());
15196 }
15197
15198 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
15199 // (fcanonicalize k)
15200 //
15201 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
15202
15203 // TODO: This could be better for wider vectors that will be split into v2f16,
15204 // and could also consider uses, since there aren't that many packed operations.
15205 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
15206 isTypeLegal(VT: MVT::v2f16)) {
15207 SDLoc SL(N);
15208 SDValue NewElts[2];
15209 SDValue Lo = N0.getOperand(i: 0);
15210 SDValue Hi = N0.getOperand(i: 1);
15211 EVT EltVT = Lo.getValueType();
15212
15213 if (vectorEltWillFoldAway(Op: Lo) || vectorEltWillFoldAway(Op: Hi)) {
15214 for (unsigned I = 0; I != 2; ++I) {
15215 SDValue Op = N0.getOperand(i: I);
15216 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Val&: Op)) {
15217 NewElts[I] =
15218 getCanonicalConstantFP(DAG, SL, VT: EltVT, C: CFP->getValueAPF());
15219 } else if (Op.isUndef()) {
15220 // Handled below based on what the other operand is.
15221 NewElts[I] = Op;
15222 } else {
15223 NewElts[I] = DAG.getNode(Opcode: ISD::FCANONICALIZE, DL: SL, VT: EltVT, Operand: Op);
15224 }
15225 }
15226
15227 // If one half is undef and the other is a constant, prefer a splat vector
15228 // rather than the normal qNaN. If the other half is a register, prefer 0.0
15229 // since that's cheaper to use and may be free with a packed operation.
15230 if (NewElts[0].isUndef()) {
15232 NewElts[0] = isa<ConstantFPSDNode>(Val: NewElts[1])
15233 ? NewElts[1]
15234 : DAG.getConstantFP(Val: 0.0f, DL: SL, VT: EltVT);
15235 }
15236
15237 if (NewElts[1].isUndef()) {
15238 NewElts[1] = isa<ConstantFPSDNode>(Val: NewElts[0])
15239 ? NewElts[0]
15240 : DAG.getConstantFP(Val: 0.0f, DL: SL, VT: EltVT);
15241 }
15242
15243 return DAG.getBuildVector(VT, DL: SL, Ops: NewElts);
15244 }
15245 }
15246
15247 return SDValue();
15248}
15249
15250static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
15251 switch (Opc) {
15252 case ISD::FMAXNUM:
15253 case ISD::FMAXNUM_IEEE:
15254 case ISD::FMAXIMUMNUM:
15255 return AMDGPUISD::FMAX3;
15256 case ISD::FMAXIMUM:
15257 return AMDGPUISD::FMAXIMUM3;
15258 case ISD::SMAX:
15259 return AMDGPUISD::SMAX3;
15260 case ISD::UMAX:
15261 return AMDGPUISD::UMAX3;
15262 case ISD::FMINNUM:
15263 case ISD::FMINNUM_IEEE:
15264 case ISD::FMINIMUMNUM:
15265 return AMDGPUISD::FMIN3;
15266 case ISD::FMINIMUM:
15267 return AMDGPUISD::FMINIMUM3;
15268 case ISD::SMIN:
15269 return AMDGPUISD::SMIN3;
15270 case ISD::UMIN:
15271 return AMDGPUISD::UMIN3;
15272 default:
15273 llvm_unreachable("Not a min/max opcode");
15274 }
15275}
15276
15277SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
15278 const SDLoc &SL, SDValue Src,
15279 SDValue MinVal,
15280 SDValue MaxVal,
15281 bool Signed) const {
15282
15283 // med3 comes from
15284 // min(max(x, K0), K1), K0 < K1
15285 // max(min(x, K0), K1), K1 < K0
15286 //
15287 // "MinVal" and "MaxVal" respectively refer to the rhs of the
15288 // min/max op.
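// For example, smin(smax(x, -3), 7) becomes smed3(x, -3, 7), which is only
// valid because -3 < 7.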
15289 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(Val&: MinVal);
15290 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(Val&: MaxVal);
15291
15292 if (!MinK || !MaxK)
15293 return SDValue();
15294
15295 if (Signed) {
15296 if (MaxK->getAPIntValue().sge(RHS: MinK->getAPIntValue()))
15297 return SDValue();
15298 } else {
15299 if (MaxK->getAPIntValue().uge(RHS: MinK->getAPIntValue()))
15300 return SDValue();
15301 }
15302
15303 EVT VT = MinK->getValueType(ResNo: 0);
15304 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
15305 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
15306 return DAG.getNode(Opcode: Med3Opc, DL: SL, VT, N1: Src, N2: MaxVal, N3: MinVal);
15307
15308 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
15309 // not available, but this is unlikely to be profitable as constants
15310 // will often need to be materialized & extended, especially on
15311 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
15312 return SDValue();
15313}
15314
15315static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
15316 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: Op))
15317 return C;
15318
15319 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Val&: Op)) {
15320 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
15321 return C;
15322 }
15323
15324 return nullptr;
15325}
15326
15327SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
15328 const SDLoc &SL, SDValue Op0,
15329 SDValue Op1) const {
15330 ConstantFPSDNode *K1 = getSplatConstantFP(Op: Op1);
15331 if (!K1)
15332 return SDValue();
15333
15334 ConstantFPSDNode *K0 = getSplatConstantFP(Op: Op0.getOperand(i: 1));
15335 if (!K0)
15336 return SDValue();
15337
15338 // Ordered >= (although NaN inputs should have folded away by now).
15339 if (K0->getValueAPF() > K1->getValueAPF())
15340 return SDValue();
15341
15342 // med3 with a nan input acts like
15343 // v_min_f32(v_min_f32(S0.f32, S1.f32), S2.f32)
15344 //
15345 // So with a signaling nan input, the result depends on whether the IEEE mode
15346 // bit is enabled or not:
15347 // ieee=1
15348 // s0 snan: yields s2
15349 // s1 snan: yields s2
15350 // s2 snan: qnan
15351
15352 // s0 qnan: min(s1, s2)
15353 // s1 qnan: min(s0, s2)
15354 // s2 qnan: min(s0, s1)
15355
15356 // ieee=0
15357 // s0 snan: min(s1, s2)
15358 // s1 snan: min(s0, s2)
15359 // s2 snan: qnan
15360
15361 // s0 qnan: min(s1, s2)
15362 // s1 qnan: min(s0, s2)
15363 // s2 qnan: min(s0, s1)
15364 const MachineFunction &MF = DAG.getMachineFunction();
15365 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15366
15367 // TODO: Check whether the IEEE bit is enabled. We can form fmed3 with IEEE=0
15368 // regardless of whether the input is a signaling nan if op0 is fmaximum or
15369 // fmaximumnum. If op0 is fmaxnum_ieee, we can only form it when IEEE=1.
15370 EVT VT = Op0.getValueType();
15371 if (Info->getMode().DX10Clamp) {
15372 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
15373 // hardware fmed3 behavior converting to a min.
15374 // FIXME: Should this be allowing -0.0?
15375 if (K1->isExactlyValue(V: 1.0) && K0->isExactlyValue(V: 0.0))
15376 return DAG.getNode(Opcode: AMDGPUISD::CLAMP, DL: SL, VT, Operand: Op0.getOperand(i: 0));
15377 }
15378
15379 // med3 for f16 is only available on gfx9+, and not available for v2f16.
15380 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
15381 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
15382 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
15383 // then give the other result, which is different from med3 with a NaN
15384 // input.
15385 SDValue Var = Op0.getOperand(i: 0);
15386 if (!DAG.isKnownNeverSNaN(Op: Var))
15387 return SDValue();
15388
15389 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15390
15391 if ((!K0->hasOneUse() || TII->isInlineConstant(Imm: K0->getValueAPF())) &&
15392 (!K1->hasOneUse() || TII->isInlineConstant(Imm: K1->getValueAPF()))) {
15393 return DAG.getNode(Opcode: AMDGPUISD::FMED3, DL: SL, VT: K0->getValueType(ResNo: 0), N1: Var,
15394 N2: SDValue(K0, 0), N3: SDValue(K1, 0));
15395 }
15396 }
15397
15398 return SDValue();
15399}
15400
15401/// \return true if the subtarget supports minimum3 and maximum3 with the given
15402/// base min/max opcode \p Opc for type \p VT.
15403static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
15404 EVT VT) {
15405 switch (Opc) {
15406 case ISD::FMINNUM:
15407 case ISD::FMAXNUM:
15408 case ISD::FMINNUM_IEEE:
15409 case ISD::FMAXNUM_IEEE:
15410 case ISD::FMINIMUMNUM:
15411 case ISD::FMAXIMUMNUM:
15412 case AMDGPUISD::FMIN_LEGACY:
15413 case AMDGPUISD::FMAX_LEGACY:
15414 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
15415 (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
15416 case ISD::FMINIMUM:
15417 case ISD::FMAXIMUM:
15418 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
15419 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
15420 (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
15421 case ISD::SMAX:
15422 case ISD::SMIN:
15423 case ISD::UMAX:
15424 case ISD::UMIN:
15425 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
15426 default:
15427 return false;
15428 }
15429
15430 llvm_unreachable("not a min/max opcode");
15431}
15432
15433SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
15434 DAGCombinerInfo &DCI) const {
15435 SelectionDAG &DAG = DCI.DAG;
15436
15437 EVT VT = N->getValueType(ResNo: 0);
15438 unsigned Opc = N->getOpcode();
15439 SDValue Op0 = N->getOperand(Num: 0);
15440 SDValue Op1 = N->getOperand(Num: 1);
15441
15442 // Only do this if the inner op has one use, since this would otherwise just
15443 // increase register pressure for no benefit.
15444
15445 if (supportsMin3Max3(Subtarget: *Subtarget, Opc, VT)) {
15446 // max(max(a, b), c) -> max3(a, b, c)
15447 // min(min(a, b), c) -> min3(a, b, c)
15448 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
15449 SDLoc DL(N);
15450 return DAG.getNode(Opcode: minMaxOpcToMin3Max3Opc(Opc), DL, VT: N->getValueType(ResNo: 0),
15451 N1: Op0.getOperand(i: 0), N2: Op0.getOperand(i: 1), N3: Op1);
15452 }
15453
15454 // Try commuted.
15455 // max(a, max(b, c)) -> max3(a, b, c)
15456 // min(a, min(b, c)) -> min3(a, b, c)
15457 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
15458 SDLoc DL(N);
15459 return DAG.getNode(Opcode: minMaxOpcToMin3Max3Opc(Opc), DL, VT: N->getValueType(ResNo: 0),
15460 N1: Op0, N2: Op1.getOperand(i: 0), N3: Op1.getOperand(i: 1));
15461 }
15462 }
15463
15464 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
15465 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
15466 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
15467 if (SDValue Med3 = performIntMed3ImmCombine(
15468 DAG, SL: SDLoc(N), Src: Op0->getOperand(Num: 0), MinVal: Op1, MaxVal: Op0->getOperand(Num: 1), Signed: true))
15469 return Med3;
15470 }
15471 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
15472 if (SDValue Med3 = performIntMed3ImmCombine(
15473 DAG, SL: SDLoc(N), Src: Op0->getOperand(Num: 0), MinVal: Op0->getOperand(Num: 1), MaxVal: Op1, Signed: true))
15474 return Med3;
15475 }
15476
15477 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
15478 if (SDValue Med3 = performIntMed3ImmCombine(
15479 DAG, SL: SDLoc(N), Src: Op0->getOperand(Num: 0), MinVal: Op1, MaxVal: Op0->getOperand(Num: 1), Signed: false))
15480 return Med3;
15481 }
15482 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
15483 if (SDValue Med3 = performIntMed3ImmCombine(
15484 DAG, SL: SDLoc(N), Src: Op0->getOperand(Num: 0), MinVal: Op0->getOperand(Num: 1), MaxVal: Op1, Signed: false))
15485 return Med3;
15486 }
15487
15488 // if !is_snan(x):
15489 // fminnum(fmaxnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
15490 // fminnum_ieee(fmaxnum_ieee(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
15491 // fminnumnum(fmaxnumnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
15492 // fmin_legacy(fmax_legacy(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
15493 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
15494 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
15495 (Opc == ISD::FMINIMUMNUM && Op0.getOpcode() == ISD::FMAXIMUMNUM) ||
15496 (Opc == AMDGPUISD::FMIN_LEGACY &&
15497 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
15498 (VT == MVT::f32 || VT == MVT::f64 ||
15499 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
15500 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
15501 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
15502 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
15503 Op0.hasOneUse()) {
15504 if (SDValue Res = performFPMed3ImmCombine(DAG, SL: SDLoc(N), Op0, Op1))
15505 return Res;
15506 }
15507
15508 // Prefer fminnum_ieee over fminimum. On gfx950, minimum/maximum are legal
15509 // for some types, but at a higher cost since they are implemented with a
15510 // three-operand form.
15511 const SDNodeFlags Flags = N->getFlags();
15512 if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) && Flags.hasNoNaNs() &&
15513 !Subtarget->hasIEEEMinimumMaximumInsts() &&
15514 isOperationLegal(Op: ISD::FMINNUM_IEEE, VT: VT.getScalarType())) {
15515 unsigned NewOpc =
15516 Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
15517 return DAG.getNode(Opcode: NewOpc, DL: SDLoc(N), VT, N1: Op0, N2: Op1, Flags);
15518 }
15519
15520 return SDValue();
15521}
15522
15523static bool isClampZeroToOne(SDValue A, SDValue B) {
15524 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(Val&: A)) {
15525 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(Val&: B)) {
15526 // FIXME: Should this be allowing -0.0?
15527 return (CA->isExactlyValue(V: 0.0) && CB->isExactlyValue(V: 1.0)) ||
15528 (CA->isExactlyValue(V: 1.0) && CB->isExactlyValue(V: 0.0));
15529 }
15530 }
15531
15532 return false;
15533}
15534
15535// FIXME: Should only worry about snans for version with chain.
15536SDValue SITargetLowering::performFMed3Combine(SDNode *N,
15537 DAGCombinerInfo &DCI) const {
15538 EVT VT = N->getValueType(ResNo: 0);
15539 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
15540 // NaNs. With a NaN input, the order of the operands may change the result.
15541
15542 SelectionDAG &DAG = DCI.DAG;
15543 SDLoc SL(N);
15544
15545 SDValue Src0 = N->getOperand(Num: 0);
15546 SDValue Src1 = N->getOperand(Num: 1);
15547 SDValue Src2 = N->getOperand(Num: 2);
15548
15549 if (isClampZeroToOne(A: Src0, B: Src1)) {
15550 // const_a, const_b, x -> clamp is safe in all cases including signaling
15551 // nans.
15552 // FIXME: Should this be allowing -0.0?
15553 return DAG.getNode(Opcode: AMDGPUISD::CLAMP, DL: SL, VT, Operand: Src2);
15554 }
15555
15556 const MachineFunction &MF = DAG.getMachineFunction();
15557 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15558
15559 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
15560 // handling no dx10-clamp?
15561 if (Info->getMode().DX10Clamp) {
15562 // If NaNs are clamped to 0, we are free to reorder the inputs.
15563
15564 if (isa<ConstantFPSDNode>(Val: Src0) && !isa<ConstantFPSDNode>(Val: Src1))
15565 std::swap(a&: Src0, b&: Src1);
15566
15567 if (isa<ConstantFPSDNode>(Val: Src1) && !isa<ConstantFPSDNode>(Val: Src2))
15568 std::swap(a&: Src1, b&: Src2);
15569
15570 if (isa<ConstantFPSDNode>(Val: Src0) && !isa<ConstantFPSDNode>(Val: Src1))
15571 std::swap(a&: Src0, b&: Src1);
15572
15573 if (isClampZeroToOne(A: Src1, B: Src2))
15574 return DAG.getNode(Opcode: AMDGPUISD::CLAMP, DL: SL, VT, Operand: Src0);
15575 }
15576
15577 return SDValue();
15578}
15579
15580SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
15581 DAGCombinerInfo &DCI) const {
15582 SDValue Src0 = N->getOperand(Num: 0);
15583 SDValue Src1 = N->getOperand(Num: 1);
15584 if (Src0.isUndef() && Src1.isUndef())
15585 return DCI.DAG.getUNDEF(VT: N->getValueType(ResNo: 0));
15586 return SDValue();
15587}
15588
15589// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
15590// expanded into a set of cmp/select instructions.
15591bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
15592 unsigned NumElem,
15593 bool IsDivergentIdx,
15594 const GCNSubtarget *Subtarget) {
15595 if (UseDivergentRegisterIndexing)
15596 return false;
15597
15598 unsigned VecSize = EltSize * NumElem;
15599
15600 // Sub-dword vectors of two dwords or less already have a better lowering.
15601 if (VecSize <= 64 && EltSize < 32)
15602 return false;
15603
15604 // Always expand the remaining sub-dword cases, otherwise they will be
15605 // lowered via memory.
15606 if (EltSize < 32)
15607 return true;
15608
15609 // Always do this if var-idx is divergent, otherwise it will become a loop.
15610 if (IsDivergentIdx)
15611 return true;
15612
15613 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
15614 unsigned NumInsts = NumElem /* Number of compares */ +
15615 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
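// For example, a variable-index extract from <8 x i32> costs 8 compares plus
// 8 cndmasks, i.e. NumInsts == 16.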
15616
15617 // On some architectures (GFX9) movrel is not available and it's better
15618 // to expand.
15619 if (Subtarget->useVGPRIndexMode())
15620 return NumInsts <= 16;
15621
15622 // If movrel is available, use it instead of expanding for vectors of 8
15623 // elements.
15624 if (Subtarget->hasMovrel())
15625 return NumInsts <= 15;
15626
15627 return true;
15628}
15629
15630bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
15631 SDValue Idx = N->getOperand(Num: N->getNumOperands() - 1);
15632 if (isa<ConstantSDNode>(Val: Idx))
15633 return false;
15634
15635 SDValue Vec = N->getOperand(Num: 0);
15636 EVT VecVT = Vec.getValueType();
15637 EVT EltVT = VecVT.getVectorElementType();
15638 unsigned EltSize = EltVT.getSizeInBits();
15639 unsigned NumElem = VecVT.getVectorNumElements();
15640
15641 return SITargetLowering::shouldExpandVectorDynExt(
15642 EltSize, NumElem, IsDivergentIdx: Idx->isDivergent(), Subtarget: getSubtarget());
15643}
15644
15645SDValue
15646SITargetLowering::performExtractVectorEltCombine(SDNode *N,
15647 DAGCombinerInfo &DCI) const {
15648 SDValue Vec = N->getOperand(Num: 0);
15649 SelectionDAG &DAG = DCI.DAG;
15650
15651 EVT VecVT = Vec.getValueType();
15652 EVT VecEltVT = VecVT.getVectorElementType();
15653 EVT ResVT = N->getValueType(ResNo: 0);
15654
15655 unsigned VecSize = VecVT.getSizeInBits();
15656 unsigned VecEltSize = VecEltVT.getSizeInBits();
15657
15658 if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
15659 allUsesHaveSourceMods(N)) {
15660 SDLoc SL(N);
15661 SDValue Idx = N->getOperand(Num: 1);
15662 SDValue Elt =
15663 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ResVT, N1: Vec.getOperand(i: 0), N2: Idx);
15664 return DAG.getNode(Opcode: Vec.getOpcode(), DL: SL, VT: ResVT, Operand: Elt);
15665 }
15666
15667 // (extract_vector_element (and {y0, y1}, (build_vector 0x1f, 0x1f)), index)
15668 // -> (and (extract_vector_element {y0, y1}, index), 0x1f)
15669 // There are optimisations to transform 64-bit shifts into 32-bit shifts
15670 // depending on the shift operand. See e.g. performSraCombine().
15671 // This combine ensures that those optimisations remain compatible with the
15672 // legalised v2i32 AND.
15673 if (VecVT == MVT::v2i32 && Vec->getOpcode() == ISD::AND &&
15674 Vec->getOperand(Num: 1)->getOpcode() == ISD::BUILD_VECTOR) {
15675
15676 const ConstantSDNode *C = isConstOrConstSplat(N: Vec.getOperand(i: 1));
15677 if (!C || C->getZExtValue() != 0x1f)
15678 return SDValue();
15679
15680 SDLoc SL(N);
15681 SDValue AndMask = DAG.getConstant(Val: 0x1f, DL: SL, VT: MVT::i32);
15682 SDValue EVE = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32,
15683 N1: Vec->getOperand(Num: 0), N2: N->getOperand(Num: 1));
15684 SDValue A = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32, N1: EVE, N2: AndMask);
15685 DAG.ReplaceAllUsesWith(From: N, To: A.getNode());
15686 }
15687
15688 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
15689 // =>
15690 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
15691 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
15692 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
15693 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
15694 SDLoc SL(N);
15695 SDValue Idx = N->getOperand(Num: 1);
15696 unsigned Opc = Vec.getOpcode();
15697
15698 switch (Opc) {
15699 default:
15700 break;
15701 // TODO: Support other binary operations.
15702 case ISD::FADD:
15703 case ISD::FSUB:
15704 case ISD::FMUL:
15705 case ISD::ADD:
15706 case ISD::UMIN:
15707 case ISD::UMAX:
15708 case ISD::SMIN:
15709 case ISD::SMAX:
15710 case ISD::FMAXNUM:
15711 case ISD::FMINNUM:
15712 case ISD::FMAXNUM_IEEE:
15713 case ISD::FMINNUM_IEEE:
15714 case ISD::FMAXIMUM:
15715 case ISD::FMINIMUM: {
15716 SDValue Elt0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ResVT,
15717 N1: Vec.getOperand(i: 0), N2: Idx);
15718 SDValue Elt1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ResVT,
15719 N1: Vec.getOperand(i: 1), N2: Idx);
15720
15721 DCI.AddToWorklist(N: Elt0.getNode());
15722 DCI.AddToWorklist(N: Elt1.getNode());
15723 return DAG.getNode(Opcode: Opc, DL: SL, VT: ResVT, N1: Elt0, N2: Elt1, Flags: Vec->getFlags());
15724 }
15725 }
15726 }
15727
15728 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
15729 if (shouldExpandVectorDynExt(N)) {
15730 SDLoc SL(N);
15731 SDValue Idx = N->getOperand(Num: 1);
15732 SDValue V;
15733 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
15734 SDValue IC = DAG.getVectorIdxConstant(Val: I, DL: SL);
15735 SDValue Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ResVT, N1: Vec, N2: IC);
15736 if (I == 0)
15737 V = Elt;
15738 else
15739 V = DAG.getSelectCC(DL: SL, LHS: Idx, RHS: IC, True: Elt, False: V, Cond: ISD::SETEQ);
15740 }
15741 return V;
15742 }
15743
15744 if (!DCI.isBeforeLegalize())
15745 return SDValue();
15746
15747 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
15748 // elements. This exposes more load reduction opportunities by replacing
15749 // multiple small extract_vector_elements with a single 32-bit extract.
15750 auto *Idx = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
15751 if (isa<MemSDNode>(Val: Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
15752 VecSize > 32 && VecSize % 32 == 0 && Idx) {
15753 EVT NewVT = getEquivalentMemType(Context&: *DAG.getContext(), VT: VecVT);
15754
15755 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
15756 unsigned EltIdx = BitIndex / 32;
15757 unsigned LeftoverBitIdx = BitIndex % 32;
15758 SDLoc SL(N);
15759
15760 SDValue Cast = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVT, Operand: Vec);
15761 DCI.AddToWorklist(N: Cast.getNode());
15762
15763 SDValue Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Cast,
15764 N2: DAG.getConstant(Val: EltIdx, DL: SL, VT: MVT::i32));
15765 DCI.AddToWorklist(N: Elt.getNode());
15766 SDValue Srl = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: MVT::i32, N1: Elt,
15767 N2: DAG.getConstant(Val: LeftoverBitIdx, DL: SL, VT: MVT::i32));
15768 DCI.AddToWorklist(N: Srl.getNode());
15769
15770 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
15771 SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: VecEltAsIntVT, Operand: Srl);
15772 DCI.AddToWorklist(N: Trunc.getNode());
15773
15774 if (VecEltVT == ResVT) {
15775 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: VecEltVT, Operand: Trunc);
15776 }
15777
15778 assert(ResVT.isScalarInteger());
15779 return DAG.getAnyExtOrTrunc(Op: Trunc, DL: SL, VT: ResVT);
15780 }
15781
15782 return SDValue();
15783}
15784
15785SDValue
15786SITargetLowering::performInsertVectorEltCombine(SDNode *N,
15787 DAGCombinerInfo &DCI) const {
15788 SDValue Vec = N->getOperand(Num: 0);
15789 SDValue Idx = N->getOperand(Num: 2);
15790 EVT VecVT = Vec.getValueType();
15791 EVT EltVT = VecVT.getVectorElementType();
15792
15793 // INSERT_VECTOR_ELT (<n x e>, var-idx)
15794 // => BUILD_VECTOR n x select (e, const-idx)
15795 if (!shouldExpandVectorDynExt(N))
15796 return SDValue();
15797
15798 SelectionDAG &DAG = DCI.DAG;
15799 SDLoc SL(N);
15800 SDValue Ins = N->getOperand(Num: 1);
15801 EVT IdxVT = Idx.getValueType();
15802
15803 SmallVector<SDValue, 16> Ops;
15804 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
15805 SDValue IC = DAG.getConstant(Val: I, DL: SL, VT: IdxVT);
15806 SDValue Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, N1: Vec, N2: IC);
15807 SDValue V = DAG.getSelectCC(DL: SL, LHS: Idx, RHS: IC, True: Ins, False: Elt, Cond: ISD::SETEQ);
15808 Ops.push_back(Elt: V);
15809 }
15810
15811 return DAG.getBuildVector(VT: VecVT, DL: SL, Ops);
15812}
15813
15814/// Return the source of an fp_extend from f16 to f32, or a converted FP
15815/// constant.
15816static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
15817 if (Src.getOpcode() == ISD::FP_EXTEND &&
15818 Src.getOperand(i: 0).getValueType() == MVT::f16) {
15819 return Src.getOperand(i: 0);
15820 }
15821
15822 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Val&: Src)) {
15823 APFloat Val = CFP->getValueAPF();
15824 bool LosesInfo = true;
15825 Val.convert(ToSemantics: APFloat::IEEEhalf(), RM: APFloat::rmNearestTiesToEven, losesInfo: &LosesInfo);
15826 if (!LosesInfo)
15827 return DAG.getConstantFP(Val, DL: SDLoc(Src), VT: MVT::f16);
15828 }
15829
15830 return SDValue();
15831}
15832
15833SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
15834 DAGCombinerInfo &DCI) const {
15835 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
15836 "combine only useful on gfx8");
15837
15838 SDValue TruncSrc = N->getOperand(Num: 0);
15839 EVT VT = N->getValueType(ResNo: 0);
15840 if (VT != MVT::f16)
15841 return SDValue();
15842
15843 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
15844 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
15845 return SDValue();
15846
15847 SelectionDAG &DAG = DCI.DAG;
15848 SDLoc SL(N);
15849
15850 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
15851 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
15852 // casting back.
15853
15854 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
15855 // fmin(fmax(a, b), fmax(fmin(a, b), c))
15856 SDValue A = strictFPExtFromF16(DAG, Src: TruncSrc.getOperand(i: 0));
15857 if (!A)
15858 return SDValue();
15859
15860 SDValue B = strictFPExtFromF16(DAG, Src: TruncSrc.getOperand(i: 1));
15861 if (!B)
15862 return SDValue();
15863
15864 SDValue C = strictFPExtFromF16(DAG, Src: TruncSrc.getOperand(i: 2));
15865 if (!C)
15866 return SDValue();
15867
15868 // This changes signaling nan behavior. If an input is a signaling nan, it
15869 // would have been quieted by the fpext originally. We don't care because
15870 // these are unconstrained ops. If we needed to insert quieting canonicalizes
15871 // we would be worse off than just doing the promotion.
15872 SDValue A1 = DAG.getNode(Opcode: ISD::FMINNUM_IEEE, DL: SL, VT, N1: A, N2: B);
15873 SDValue B1 = DAG.getNode(Opcode: ISD::FMAXNUM_IEEE, DL: SL, VT, N1: A, N2: B);
15874 SDValue C1 = DAG.getNode(Opcode: ISD::FMAXNUM_IEEE, DL: SL, VT, N1: A1, N2: C);
15875 return DAG.getNode(Opcode: ISD::FMINNUM_IEEE, DL: SL, VT, N1: B1, N2: C1);
15876}
15877
15878unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
15879 const SDNode *N0,
15880 const SDNode *N1) const {
15881 EVT VT = N0->getValueType(ResNo: 0);
15882
15883 // Only do this if we are not trying to support denormals. v_mad_f32 does not
15884 // support denormals ever.
15885 if (((VT == MVT::f32 &&
15886 denormalModeIsFlushAllF32(MF: DAG.getMachineFunction())) ||
15887 (VT == MVT::f16 && Subtarget->hasMadF16() &&
15888 denormalModeIsFlushAllF64F16(MF: DAG.getMachineFunction()))) &&
15889 isOperationLegal(Op: ISD::FMAD, VT))
15890 return ISD::FMAD;
15891
15892 const TargetOptions &Options = DAG.getTarget().Options;
15893 if ((Options.AllowFPOpFusion == FPOpFusion::Fast ||
15894 (N0->getFlags().hasAllowContract() &&
15895 N1->getFlags().hasAllowContract())) &&
15896 isFMAFasterThanFMulAndFAdd(MF: DAG.getMachineFunction(), VT)) {
15897 return ISD::FMA;
15898 }
15899
15900 return 0;
15901}
15902
15903// For a reassociatable opcode perform:
15904// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
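// For example, add s0, (add v0, s1) -> add (add s0, s1), v0, so the inner add
// of the two uniform values can stay on the SALU.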
15905SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
15906 SelectionDAG &DAG) const {
15907 EVT VT = N->getValueType(ResNo: 0);
15908 if (VT != MVT::i32 && VT != MVT::i64)
15909 return SDValue();
15910
15911 if (DAG.isBaseWithConstantOffset(Op: SDValue(N, 0)))
15912 return SDValue();
15913
15914 unsigned Opc = N->getOpcode();
15915 SDValue Op0 = N->getOperand(Num: 0);
15916 SDValue Op1 = N->getOperand(Num: 1);
15917
15918 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
15919 return SDValue();
15920
15921 if (Op0->isDivergent())
15922 std::swap(a&: Op0, b&: Op1);
15923
15924 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
15925 return SDValue();
15926
15927 SDValue Op2 = Op1.getOperand(i: 1);
15928 Op1 = Op1.getOperand(i: 0);
15929 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
15930 return SDValue();
15931
15932 if (Op1->isDivergent())
15933 std::swap(a&: Op1, b&: Op2);
15934
15935 SDLoc SL(N);
15936 SDValue Add1 = DAG.getNode(Opcode: Opc, DL: SL, VT, N1: Op0, N2: Op1);
15937 return DAG.getNode(Opcode: Opc, DL: SL, VT, N1: Add1, N2: Op2);
15938}
15939
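// Build a MAD_I64_I32/MAD_U64_U32 computing N0 * N1 + N2 as an i64 (the i1
// carry-out result is unused) and truncate the result to VT.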
15940static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
15941 SDValue N0, SDValue N1, SDValue N2, bool Signed) {
15942 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
15943 SDVTList VTs = DAG.getVTList(VT1: MVT::i64, VT2: MVT::i1);
15944 SDValue Mad = DAG.getNode(Opcode: MadOpc, DL: SL, VTList: VTs, N1: N0, N2: N1, N3: N2);
15945 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: Mad);
15946}
15947
15948// Fold
15949//   y = lshr i64 x, 32
15950//   res = add (mul i64 y, Const), x where "Const" is a 64-bit constant
15951//   with Const.hi == -1
15952// To
15953//   res = mad_u64_u32 y.lo, Const.lo, x.lo
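// This holds because y is x.hi zero-extended, so modulo 2^64
//   y * Const + x = x.hi * Const.lo + 2^32 * x.hi * Const.hi + 2^32 * x.hi + x.lo
// and with Const.hi == -1 the two middle terms sum to 2^64 * x.hi == 0,
// leaving x.hi * Const.lo + zext(x.lo).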
15954static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
15955 SDValue MulLHS, SDValue MulRHS,
15956 SDValue AddRHS) {
15957 if (MulRHS.getOpcode() == ISD::SRL)
15958 std::swap(a&: MulLHS, b&: MulRHS);
15959
15960 if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL)
15961 return SDValue();
15962
15963 ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(Val: MulLHS.getOperand(i: 1));
15964 if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 ||
15965 MulLHS.getOperand(i: 0) != AddRHS)
15966 return SDValue();
15967
15968 ConstantSDNode *Const = dyn_cast<ConstantSDNode>(Val: MulRHS.getNode());
15969 if (!Const || Hi_32(Value: Const->getZExtValue()) != uint32_t(-1))
15970 return SDValue();
15971
15972 SDValue ConstMul =
15973 DAG.getConstant(Val: Lo_32(Value: Const->getZExtValue()), DL: SL, VT: MVT::i32);
15974 return getMad64_32(DAG, SL, VT: MVT::i64,
15975 N0: DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: MulLHS), N1: ConstMul,
15976 N2: DAG.getZeroExtendInReg(Op: AddRHS, DL: SL, VT: MVT::i32), Signed: false);
15977}
15978
15979// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
15980// multiplies, if any.
15981//
15982// Full 64-bit multiplies that feed into an addition are lowered here instead
15983// of using the generic expansion. The generic expansion ends up with
15984// a tree of ADD nodes that prevents us from using the "add" part of the
15985// MAD instruction. The expansion produced here results in a chain of ADDs
15986// instead of a tree.
15987SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
15988 DAGCombinerInfo &DCI) const {
15989 assert(N->isAnyAdd());
15990
15991 SelectionDAG &DAG = DCI.DAG;
15992 EVT VT = N->getValueType(ResNo: 0);
15993 SDLoc SL(N);
15994 SDValue LHS = N->getOperand(Num: 0);
15995 SDValue RHS = N->getOperand(Num: 1);
15996
15997 if (VT.isVector())
15998 return SDValue();
15999
16000 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
16001 // result in scalar registers for uniform values.
16002 if (!N->isDivergent() && Subtarget->hasSMulHi())
16003 return SDValue();
16004
16005 unsigned NumBits = VT.getScalarSizeInBits();
16006 if (NumBits <= 32 || NumBits > 64)
16007 return SDValue();
16008
16009 if (LHS.getOpcode() != ISD::MUL) {
16010 assert(RHS.getOpcode() == ISD::MUL);
16011 std::swap(a&: LHS, b&: RHS);
16012 }
16013
16014 // Avoid the fold if it would unduly increase the number of multiplies due to
16015 // multiple uses, except on hardware with full-rate multiply-add (which is
16016 // part of full-rate 64-bit ops).
16017 if (!Subtarget->hasFullRate64Ops()) {
16018 unsigned NumUsers = 0;
16019 for (SDNode *User : LHS->users()) {
16020 // There is a use that does not feed into addition, so the multiply can't
16021 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
16022 if (!User->isAnyAdd())
16023 return SDValue();
16024
16025 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
16026 // MUL + 3xADD + 3xADDC over 3xMAD.
16027 ++NumUsers;
16028 if (NumUsers >= 3)
16029 return SDValue();
16030 }
16031 }
16032
16033 SDValue MulLHS = LHS.getOperand(i: 0);
16034 SDValue MulRHS = LHS.getOperand(i: 1);
16035 SDValue AddRHS = RHS;
16036
16037 if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
16038 return FoldedMAD;
16039
16040 // Always check whether operands are small unsigned values, since that
16041 // knowledge is useful in more cases. Check for small signed values only if
16042 // doing so can unlock a shorter code sequence.
16043 bool MulLHSUnsigned32 = numBitsUnsigned(Op: MulLHS, DAG) <= 32;
16044 bool MulRHSUnsigned32 = numBitsUnsigned(Op: MulRHS, DAG) <= 32;
16045
16046 bool MulSignedLo = false;
16047 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
16048 MulSignedLo =
16049 numBitsSigned(Op: MulLHS, DAG) <= 32 && numBitsSigned(Op: MulRHS, DAG) <= 32;
16050 }
16051
16052 // The operands and final result all have the same number of bits. If
16053 // operands need to be extended, they can be extended with garbage. The
16054 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
16055 // truncated away in the end.
16056 if (VT != MVT::i64) {
16057 MulLHS = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i64, Operand: MulLHS);
16058 MulRHS = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i64, Operand: MulRHS);
16059 AddRHS = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i64, Operand: AddRHS);
16060 }
16061
16062 // The basic code generated is conceptually straightforward. Pseudo code:
16063 //
16064 // accum = mad_64_32 lhs.lo, rhs.lo, accum
16065 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
16066 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
16067 //
16068 // The second and third lines are optional, depending on whether the factors
16069 // are {sign,zero}-extended or not.
16070 //
16071 // The actual DAG is noisier than the pseudo code, but only due to
16072 // instructions that disassemble values into low and high parts, and
16073 // assemble the final result.
16074 SDValue One = DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32);
16075
16076 auto MulLHSLo = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: MulLHS);
16077 auto MulRHSLo = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: MulRHS);
16078 SDValue Accum =
16079 getMad64_32(DAG, SL, VT: MVT::i64, N0: MulLHSLo, N1: MulRHSLo, N2: AddRHS, Signed: MulSignedLo);
16080
16081 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
16082 auto [AccumLo, AccumHi] = DAG.SplitScalar(N: Accum, DL: SL, LoVT: MVT::i32, HiVT: MVT::i32);
16083
16084 if (!MulLHSUnsigned32) {
16085 auto MulLHSHi =
16086 DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL: SL, VT: MVT::i32, N1: MulLHS, N2: One);
16087 SDValue MulHi = DAG.getNode(Opcode: ISD::MUL, DL: SL, VT: MVT::i32, N1: MulLHSHi, N2: MulRHSLo);
16088 AccumHi = DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32, N1: MulHi, N2: AccumHi);
16089 }
16090
16091 if (!MulRHSUnsigned32) {
16092 auto MulRHSHi =
16093 DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL: SL, VT: MVT::i32, N1: MulRHS, N2: One);
16094 SDValue MulHi = DAG.getNode(Opcode: ISD::MUL, DL: SL, VT: MVT::i32, N1: MulLHSLo, N2: MulRHSHi);
16095 AccumHi = DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32, N1: MulHi, N2: AccumHi);
16096 }
16097
16098 Accum = DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {AccumLo, AccumHi});
16099 Accum = DAG.getBitcast(VT: MVT::i64, V: Accum);
16100 }
16101
16102 if (VT != MVT::i64)
16103 Accum = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: Accum);
16104 return Accum;
16105}
16106
16107SDValue
16108SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
16109 DAGCombinerInfo &DCI) const {
16110 SDValue RHS = N->getOperand(Num: 1);
16111 auto *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
16112 if (!CRHS)
16113 return SDValue();
16114
16115 // TODO: Worth using computeKnownBits? Maybe expensive since it's so
16116 // common.
16117 uint64_t Val = CRHS->getZExtValue();
16118 if (countr_zero(Val) >= 32) {
16119 SelectionDAG &DAG = DCI.DAG;
16120 SDLoc SL(N);
16121 SDValue LHS = N->getOperand(Num: 0);
16122
16123 // Avoid carry machinery if we know the low half of the add does not
16124 // contribute to the final result.
16125 //
16126 // add i64:x, K if computeTrailingZeros(K) >= 32
16127 // => build_pair (add x.hi, K.hi), x.lo
16128
16129 // Breaking the 64-bit add here with this strange constant is unlikely
16130 // to interfere with addressing mode patterns.
16131
16132 SDValue Hi = getHiHalf64(Op: LHS, DAG);
16133 SDValue ConstHi32 = DAG.getConstant(Val: Hi_32(Value: Val), DL: SL, VT: MVT::i32);
16134 unsigned Opcode = N->getOpcode();
16135 if (Opcode == ISD::PTRADD)
16136 Opcode = ISD::ADD;
16137 SDValue AddHi =
16138 DAG.getNode(Opcode, DL: SL, VT: MVT::i32, N1: Hi, N2: ConstHi32, Flags: N->getFlags());
16139
16140 SDValue Lo = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: LHS);
16141 return DAG.getNode(Opcode: ISD::BUILD_PAIR, DL: SL, VT: MVT::i64, N1: Lo, N2: AddHi);
16142 }
16143
16144 return SDValue();
16145}
16146
16147// Collect the ultimate source of each of the mul node's operands, and confirm
16148// each operand is effectively only 8 bits wide (its byte 1 is known zero or
16148// does not exist).
16149static std::optional<ByteProvider<SDValue>>
16150handleMulOperand(const SDValue &MulOperand) {
16151 auto Byte0 = calculateByteProvider(Op: MulOperand, Index: 0, Depth: 0);
16152 if (!Byte0 || Byte0->isConstantZero()) {
16153 return std::nullopt;
16154 }
16155 auto Byte1 = calculateByteProvider(Op: MulOperand, Index: 1, Depth: 0);
16156 if (Byte1 && !Byte1->isConstantZero()) {
16157 return std::nullopt;
16158 }
16159 return Byte0;
16160}
16161
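// Merge two v_perm selector masks. The masks are built so that in every byte
// at least one of the two holds the "constant zero byte" selector 0x0c (the
// asserts below check this); the merged mask keeps the real selector in each
// byte and keeps 0x0c only where both inputs have it.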
16162static unsigned addPermMasks(unsigned First, unsigned Second) {
16163 unsigned FirstCs = First & 0x0c0c0c0c;
16164 unsigned SecondCs = Second & 0x0c0c0c0c;
16165 unsigned FirstNoCs = First & ~0x0c0c0c0c;
16166 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
16167
16168 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
16169 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
16170 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
16171 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
16172
16173 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
16174}
16175
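// One partial source of a dot operand: the value it comes from, the v_perm
// selector mask accumulated for it so far, and which dword of that value the
// selected bytes live in.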
16176struct DotSrc {
16177 SDValue SrcOp;
16178 int64_t PermMask;
16179 int64_t DWordOffset;
16180};
16181
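// For one byte position (Step) of a dot chain, record where Src0 and Src1 come
// from: if a source list already has an entry for the same dword of the same
// value, merge the new selector byte into that entry's perm mask; otherwise
// append a new entry.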
16182static void placeSources(ByteProvider<SDValue> &Src0,
16183 ByteProvider<SDValue> &Src1,
16184 SmallVectorImpl<DotSrc> &Src0s,
16185 SmallVectorImpl<DotSrc> &Src1s, int Step) {
16186
16187 assert(Src0.Src.has_value() && Src1.Src.has_value());
16188 // Src0s and Src1s are still empty, so just place the sources arbitrarily.
16189 if (Step == 0) {
16190 Src0s.push_back(Elt: {.SrcOp: *Src0.Src, .PermMask: ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
16191 .DWordOffset: Src0.SrcOffset / 4});
16192 Src1s.push_back(Elt: {.SrcOp: *Src1.Src, .PermMask: ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
16193 .DWordOffset: Src1.SrcOffset / 4});
16194 return;
16195 }
16196
16197 for (int BPI = 0; BPI < 2; BPI++) {
16198 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
16199 if (BPI == 1) {
16200 BPP = {Src1, Src0};
16201 }
16202 unsigned ZeroMask = 0x0c0c0c0c;
16203 unsigned FMask = 0xFF << (8 * (3 - Step));
16204
16205 unsigned FirstMask =
16206 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
16207 unsigned SecondMask =
16208 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
16209 // Attempt to find the Src vector which contains our SDValue; if found, add
16210 // our perm mask to the existing one. If we are unable to find a match for
16211 // the first SDValue, attempt to find a match for the second.
16212 int FirstGroup = -1;
16213 for (int I = 0; I < 2; I++) {
16214 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
16215 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
16216 return IterElt.SrcOp == *BPP.first.Src &&
16217 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
16218 };
16219
16220 auto *Match = llvm::find_if(Range&: Srcs, P: MatchesFirst);
16221 if (Match != Srcs.end()) {
16222 Match->PermMask = addPermMasks(First: FirstMask, Second: Match->PermMask);
16223 FirstGroup = I;
16224 break;
16225 }
16226 }
16227 if (FirstGroup != -1) {
16228 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
16229 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
16230 return IterElt.SrcOp == *BPP.second.Src &&
16231 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
16232 };
16233 auto *Match = llvm::find_if(Range&: Srcs, P: MatchesSecond);
16234 if (Match != Srcs.end()) {
16235 Match->PermMask = addPermMasks(First: SecondMask, Second: Match->PermMask);
16236 } else
16237 Srcs.push_back(Elt: {.SrcOp: *BPP.second.Src, .PermMask: SecondMask, .DWordOffset: BPP.second.SrcOffset / 4});
16238 return;
16239 }
16240 }
16241
16242 // If we have made it here, then we could not find a match in Src0s or Src1s
16243 // for either Src0 or Src1, so just place them arbitrarily.
16244
16245 unsigned ZeroMask = 0x0c0c0c0c;
16246 unsigned FMask = 0xFF << (8 * (3 - Step));
16247
16248 Src0s.push_back(
16249 Elt: {.SrcOp: *Src0.Src,
16250 .PermMask: ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
16251 .DWordOffset: Src0.SrcOffset / 4});
16252 Src1s.push_back(
16253 Elt: {.SrcOp: *Src1.Src,
16254 .PermMask: ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
16255 .DWordOffset: Src1.SrcOffset / 4});
16256}
16257
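// Turn the accumulated sources of one dot operand into a single i32: apply
// each entry's v_perm selector to its dword and OR the partial results
// together.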
16258static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
16259 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
16260 bool IsAny) {
16261
16262 // If we have just one source, simply permute it accordingly.
16263 if (Srcs.size() == 1) {
16264 auto *Elt = Srcs.begin();
16265 auto EltOp = getDWordFromOffset(DAG, SL, Src: Elt->SrcOp, DWordOffset: Elt->DWordOffset);
16266
16267 // v_perm will produce the original value
16268 if (Elt->PermMask == 0x3020100)
16269 return EltOp;
16270
16271 return DAG.getNode(Opcode: AMDGPUISD::PERM, DL: SL, VT: MVT::i32, N1: EltOp, N2: EltOp,
16272 N3: DAG.getConstant(Val: Elt->PermMask, DL: SL, VT: MVT::i32));
16273 }
16274
16275 auto *FirstElt = Srcs.begin();
16276 auto *SecondElt = std::next(x: FirstElt);
16277
16278 SmallVector<SDValue, 2> Perms;
16279
16280 // If we have multiple sources in the chain, combine them via perms (using
16281 // the calculated perm masks) and ORs.
16282 while (true) {
16283 auto FirstMask = FirstElt->PermMask;
16284 auto SecondMask = SecondElt->PermMask;
16285
16286 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
16287 unsigned FirstPlusFour = FirstMask | 0x04040404;
16288 // 0x0c + 0x04 = 0x10, so ANDing with 0x0F will produce 0x00 for any
16289 // original 0x0c.
16290 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
16291
16292 auto PermMask = addPermMasks(First: FirstMask, Second: SecondMask);
16293 auto FirstVal =
16294 getDWordFromOffset(DAG, SL, Src: FirstElt->SrcOp, DWordOffset: FirstElt->DWordOffset);
16295 auto SecondVal =
16296 getDWordFromOffset(DAG, SL, Src: SecondElt->SrcOp, DWordOffset: SecondElt->DWordOffset);
16297
16298 Perms.push_back(Elt: DAG.getNode(Opcode: AMDGPUISD::PERM, DL: SL, VT: MVT::i32, N1: FirstVal,
16299 N2: SecondVal,
16300 N3: DAG.getConstant(Val: PermMask, DL: SL, VT: MVT::i32)));
16301
16302 FirstElt = std::next(x: SecondElt);
16303 if (FirstElt == Srcs.end())
16304 break;
16305
16306 SecondElt = std::next(x: FirstElt);
16307 // If we only have a FirstElt, then just combine that into the cumulative
16308 // source node.
16309 if (SecondElt == Srcs.end()) {
16310 auto EltOp =
16311 getDWordFromOffset(DAG, SL, Src: FirstElt->SrcOp, DWordOffset: FirstElt->DWordOffset);
16312
16313 Perms.push_back(
16314 Elt: DAG.getNode(Opcode: AMDGPUISD::PERM, DL: SL, VT: MVT::i32, N1: EltOp, N2: EltOp,
16315 N3: DAG.getConstant(Val: FirstElt->PermMask, DL: SL, VT: MVT::i32)));
16316 break;
16317 }
16318 }
16319
16320 assert(Perms.size() == 1 || Perms.size() == 2);
16321 return Perms.size() == 2
16322 ? DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: Perms[0], N2: Perms[1])
16323 : Perms[0];
16324}
16325
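// The selector masks built by placeSources assume a full chain of four bytes.
// For a shorter chain of two or three bytes, shift each selector down into the
// low bytes and fill the unused high byte(s) with the zero selector 0x0c.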
16326static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
16327 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
16328 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
16329 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
16330 EntryMask += ZeroMask;
16331 }
16332}
16333
16334static bool isMul(const SDValue Op) {
16335 auto Opcode = Op.getOpcode();
16336
16337 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
16338 Opcode == AMDGPUISD::MUL_I24);
16339}
16340
16341static std::optional<bool>
16342checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
16343 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
16344 const SDValue &S1Op, const SelectionDAG &DAG) {
  // If both ops are i8s (pre legalize-dag), then the signedness semantics
  // of the dot4 are irrelevant.
16347 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
16348 return false;
16349
16350 auto Known0 = DAG.computeKnownBits(Op: S0Op, Depth: 0);
16351 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
16352 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
16353 auto Known1 = DAG.computeKnownBits(Op: S1Op, Depth: 0);
16354 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
16355 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
16356
16357 assert(!(S0IsUnsigned && S0IsSigned));
16358 assert(!(S1IsUnsigned && S1IsSigned));
16359
16360 // There are 9 possible permutations of
16361 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
16362
16363 // In two permutations, the sign bits are known to be the same for both Ops,
16364 // so simply return Signed / Unsigned corresponding to the MSB
16365
16366 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
16367 return S0IsSigned;
16368
16369 // In another two permutations, the sign bits are known to be opposite. In
16370 // this case return std::nullopt to indicate a bad match.
16371
16372 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
16373 return std::nullopt;
16374
  // In the remaining five permutations, we don't know the value of the sign
  // bit for at least one Op. Since we have a valid ByteProvider, we know that
  // the upper bits must be extension bits. Thus, the only ways for the sign
  // bit to be unknown are if it was sign-extended from an unknown value or if
  // it was any-extended. In either case, it is correct to use the signed
  // version of dot4.
16381
  // In two of these permutations, we know the sign bit is set for
  // one op, and the other is unknown. It is okay to use the signed version
  // of dot4.
16385 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
16386 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
16387 return true;
16388
  // In one such permutation, we don't know either of the sign bits. It is okay
  // to use the signed version of dot4.
16391 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
16392 return true;
16393
  // In two of these permutations, we know the sign bit is unset for
  // one op, and the other is unknown. Return std::nullopt to indicate a
  // bad match.
16397 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
16398 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
16399 return std::nullopt;
16400
16401 llvm_unreachable("Fully covered condition");
16402}
16403
16404SDValue SITargetLowering::performAddCombine(SDNode *N,
16405 DAGCombinerInfo &DCI) const {
16406 SelectionDAG &DAG = DCI.DAG;
16407 EVT VT = N->getValueType(ResNo: 0);
16408 SDLoc SL(N);
16409 SDValue LHS = N->getOperand(Num: 0);
16410 SDValue RHS = N->getOperand(Num: 1);
16411
16412 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
16413 if (Subtarget->hasMad64_32()) {
16414 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
16415 return Folded;
16416 }
16417 }
16418
16419 if (SDValue V = reassociateScalarOps(N, DAG)) {
16420 return V;
16421 }
16422
16423 if (VT == MVT::i64) {
16424 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16425 return Folded;
16426 }
16427
16428 if ((isMul(Op: LHS) || isMul(Op: RHS)) && Subtarget->hasDot7Insts() &&
16429 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
16430 SDValue TempNode(N, 0);
16431 std::optional<bool> IsSigned;
16432 SmallVector<DotSrc, 4> Src0s;
16433 SmallVector<DotSrc, 4> Src1s;
16434 SmallVector<SDValue, 4> Src2s;
16435
16436 // Match the v_dot4 tree, while collecting src nodes.
16437 int ChainLength = 0;
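    // The chain has the shape add (mul, add (mul, add (mul, ...))); at each
    // step, take the mul operand as the next pair of dot sources and descend
    // into the other (add) operand.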
16438 for (int I = 0; I < 4; I++) {
16439 auto MulIdx = isMul(Op: LHS) ? 0 : isMul(Op: RHS) ? 1 : -1;
16440 if (MulIdx == -1)
16441 break;
16442 auto Src0 = handleMulOperand(MulOperand: TempNode->getOperand(Num: MulIdx)->getOperand(Num: 0));
16443 if (!Src0)
16444 break;
16445 auto Src1 = handleMulOperand(MulOperand: TempNode->getOperand(Num: MulIdx)->getOperand(Num: 1));
16446 if (!Src1)
16447 break;
16448
16449 auto IterIsSigned = checkDot4MulSignedness(
16450 N: TempNode->getOperand(Num: MulIdx), Src0&: *Src0, Src1&: *Src1,
16451 S0Op: TempNode->getOperand(Num: MulIdx)->getOperand(Num: 0),
16452 S1Op: TempNode->getOperand(Num: MulIdx)->getOperand(Num: 1), DAG);
16453 if (!IterIsSigned)
16454 break;
16455 if (!IsSigned)
16456 IsSigned = *IterIsSigned;
16457 if (*IterIsSigned != *IsSigned)
16458 break;
16459 placeSources(Src0&: *Src0, Src1&: *Src1, Src0s, Src1s, Step: I);
16460 auto AddIdx = 1 - MulIdx;
      // Allow the special case where add (add (mul24, 0), mul24) has been
      // folded into add (mul24, mul24).
16463 if (I == 2 && isMul(Op: TempNode->getOperand(Num: AddIdx))) {
16464 Src2s.push_back(Elt: TempNode->getOperand(Num: AddIdx));
16465 auto Src0 =
16466 handleMulOperand(MulOperand: TempNode->getOperand(Num: AddIdx)->getOperand(Num: 0));
16467 if (!Src0)
16468 break;
16469 auto Src1 =
16470 handleMulOperand(MulOperand: TempNode->getOperand(Num: AddIdx)->getOperand(Num: 1));
16471 if (!Src1)
16472 break;
16473 auto IterIsSigned = checkDot4MulSignedness(
16474 N: TempNode->getOperand(Num: AddIdx), Src0&: *Src0, Src1&: *Src1,
16475 S0Op: TempNode->getOperand(Num: AddIdx)->getOperand(Num: 0),
16476 S1Op: TempNode->getOperand(Num: AddIdx)->getOperand(Num: 1), DAG);
16477 if (!IterIsSigned)
16478 break;
16479 assert(IsSigned);
16480 if (*IterIsSigned != *IsSigned)
16481 break;
16482 placeSources(Src0&: *Src0, Src1&: *Src1, Src0s, Src1s, Step: I + 1);
16483 Src2s.push_back(Elt: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32));
16484 ChainLength = I + 2;
16485 break;
16486 }
16487
16488 TempNode = TempNode->getOperand(Num: AddIdx);
16489 Src2s.push_back(Elt: TempNode);
16490 ChainLength = I + 1;
16491 if (TempNode->getNumOperands() < 2)
16492 break;
16493 LHS = TempNode->getOperand(Num: 0);
16494 RHS = TempNode->getOperand(Num: 1);
16495 }
16496
16497 if (ChainLength < 2)
16498 return SDValue();
16499
    // Masks were constructed with the assumption that we would find a chain of
    // length 4. If not, zero out the unused high bytes (via the 0x0C perm
    // selector) so they do not affect the dot calculation.
16503 if (ChainLength < 4) {
16504 fixMasks(Srcs&: Src0s, ChainLength);
16505 fixMasks(Srcs&: Src1s, ChainLength);
16506 }
16507
16508 SDValue Src0, Src1;
16509
    // If we are using a single source for both operands and have permuted the
    // bytes consistently, we can use the sources directly without permuting
    // (commutation).
16513 bool UseOriginalSrc = false;
16514 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
16515 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
16516 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
16517 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
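      // dot4 sums the four byte-wise products, so if both operands are
      // permuted with the same mask and every byte lane is used exactly once,
      // the permutation only reorders the summands and the original dwords can
      // be fed to the intrinsic directly.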
16518 SmallVector<unsigned, 4> SrcBytes;
16519 auto Src0Mask = Src0s.begin()->PermMask;
16520 SrcBytes.push_back(Elt: Src0Mask & 0xFF000000);
16521 bool UniqueEntries = true;
16522 for (auto I = 1; I < 4; I++) {
16523 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
16524
16525 if (is_contained(Range&: SrcBytes, Element: NextByte)) {
16526 UniqueEntries = false;
16527 break;
16528 }
16529 SrcBytes.push_back(Elt: NextByte);
16530 }
16531
16532 if (UniqueEntries) {
16533 UseOriginalSrc = true;
16534
16535 auto *FirstElt = Src0s.begin();
16536 auto FirstEltOp =
16537 getDWordFromOffset(DAG, SL, Src: FirstElt->SrcOp, DWordOffset: FirstElt->DWordOffset);
16538
16539 auto *SecondElt = Src1s.begin();
16540 auto SecondEltOp = getDWordFromOffset(DAG, SL, Src: SecondElt->SrcOp,
16541 DWordOffset: SecondElt->DWordOffset);
16542
16543 Src0 = DAG.getBitcastedAnyExtOrTrunc(Op: FirstEltOp, DL: SL,
16544 VT: MVT::getIntegerVT(BitWidth: 32));
16545 Src1 = DAG.getBitcastedAnyExtOrTrunc(Op: SecondEltOp, DL: SL,
16546 VT: MVT::getIntegerVT(BitWidth: 32));
16547 }
16548 }
16549
16550 if (!UseOriginalSrc) {
16551 Src0 = resolveSources(DAG, SL, Srcs&: Src0s, IsSigned: false, IsAny: true);
16552 Src1 = resolveSources(DAG, SL, Srcs&: Src1s, IsSigned: false, IsAny: true);
16553 }
16554
16555 assert(IsSigned);
16556 SDValue Src2 =
16557 DAG.getExtOrTrunc(IsSigned: *IsSigned, Op: Src2s[ChainLength - 1], DL: SL, VT: MVT::i32);
16558
16559 SDValue IID = DAG.getTargetConstant(Val: *IsSigned ? Intrinsic::amdgcn_sdot4
16560 : Intrinsic::amdgcn_udot4,
16561 DL: SL, VT: MVT::i64);
16562
16563 assert(!VT.isVector());
16564 auto Dot = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32, N1: IID, N2: Src0,
16565 N3: Src1, N4: Src2, N5: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i1));
16566
16567 return DAG.getExtOrTrunc(IsSigned: *IsSigned, Op: Dot, DL: SL, VT);
16568 }
16569
16570 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
16571 return SDValue();
16572
16573 // add x, zext (setcc) => uaddo_carry x, 0, setcc
16574 // add x, sext (setcc) => usubo_carry x, 0, setcc
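  // Canonicalize so that the extend or uaddo_carry, if present, is on the RHS.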
16575 unsigned Opc = LHS.getOpcode();
16576 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
16577 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
16578 std::swap(a&: RHS, b&: LHS);
16579
16580 Opc = RHS.getOpcode();
16581 switch (Opc) {
16582 default:
16583 break;
16584 case ISD::ZERO_EXTEND:
16585 case ISD::SIGN_EXTEND:
16586 case ISD::ANY_EXTEND: {
16587 auto Cond = RHS.getOperand(i: 0);
16588 // If this won't be a real VOPC output, we would still need to insert an
16589 // extra instruction anyway.
16590 if (!isBoolSGPR(V: Cond))
16591 break;
16592 SDVTList VTList = DAG.getVTList(VT1: MVT::i32, VT2: MVT::i1);
16593 SDValue Args[] = {LHS, DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32), Cond};
16594 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
16595 return DAG.getNode(Opcode: Opc, DL: SL, VTList, Ops: Args);
16596 }
16597 case ISD::UADDO_CARRY: {
16598 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
16599 if (!isNullConstant(V: RHS.getOperand(i: 1)))
16600 break;
16601 SDValue Args[] = {LHS, RHS.getOperand(i: 0), RHS.getOperand(i: 2)};
16602 return DAG.getNode(Opcode: ISD::UADDO_CARRY, DL: SDLoc(N), VTList: RHS->getVTList(), Ops: Args);
16603 }
16604 }
16605 return SDValue();
16606}
16607
16608SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
16609 DAGCombinerInfo &DCI) const {
16610 SelectionDAG &DAG = DCI.DAG;
16611 SDLoc DL(N);
16612 EVT VT = N->getValueType(ResNo: 0);
16613 SDValue N0 = N->getOperand(Num: 0);
16614 SDValue N1 = N->getOperand(Num: 1);
16615
16616 // The following folds transform PTRADDs into regular arithmetic in cases
16617 // where the PTRADD wouldn't be folded as an immediate offset into memory
16618 // instructions anyway. They are target-specific in that other targets might
16619 // prefer to not lose information about the pointer arithmetic.
16620
16621 // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)).
16622 // Adapted from DAGCombiner::visitADDLikeCommutative.
16623 SDValue V, K;
16624 if (sd_match(N: N1, P: m_Shl(L: m_Neg(V: m_Value(N&: V)), R: m_Value(N&: K)))) {
16625 SDNodeFlags ShlFlags = N1->getFlags();
16626 // If the original shl is NUW and NSW, the first k+1 bits of 0-v are all 0,
16627 // so v is either 0 or the first k+1 bits of v are all 1 -> NSW can be
16628 // preserved.
16629 SDNodeFlags NewShlFlags =
16630 ShlFlags.hasNoUnsignedWrap() && ShlFlags.hasNoSignedWrap()
16631 ? SDNodeFlags::NoSignedWrap
16632 : SDNodeFlags();
16633 SDValue Inner = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: V, N2: K, Flags: NewShlFlags);
16634 DCI.AddToWorklist(N: Inner.getNode());
16635 return DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: N0, N2: Inner);
16636 }
16637
16638 // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in
16639 // performAddCombine.
16640 if (N1.getOpcode() == ISD::MUL) {
16641 if (Subtarget->hasMad64_32()) {
16642 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
16643 return Folded;
16644 }
16645 }
16646
16647 // If the 32 low bits of the constant are all zero, there is nothing to fold
16648 // into an immediate offset, so it's better to eliminate the unnecessary
16649 // addition for the lower 32 bits than to preserve the PTRADD.
16650 // Analogous to a fold in performAddCombine.
16651 if (VT == MVT::i64) {
16652 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16653 return Folded;
16654 }
16655
16656 if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
16657 return SDValue();
16658
16659 SDValue X = N0;
16660 SDValue Y = N1.getOperand(i: 0);
16661 SDValue Z = N1.getOperand(i: 1);
16662 bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(N: Y);
16663 bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(N: Z);
16664
16665 if (!YIsConstant && !ZIsConstant && !X->isDivergent() &&
16666 Y->isDivergent() != Z->isDivergent()) {
16667 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
16668 // y are uniform and z isn't.
16669 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
16670 // z are uniform and y isn't.
16671 // The goal is to push uniform operands up in the computation, so that they
16672 // can be handled with scalar operations. We can't use reassociateScalarOps
16673 // for this since it requires two identical commutative operations to
16674 // reassociate.
16675 if (Y->isDivergent())
16676 std::swap(a&: Y, b&: Z);
16677 // If both additions in the original were NUW, reassociation preserves that.
16678 SDNodeFlags ReassocFlags =
16679 (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
16680 SDValue UniformInner = DAG.getMemBasePlusOffset(Base: X, Offset: Y, DL, Flags: ReassocFlags);
16681 DCI.AddToWorklist(N: UniformInner.getNode());
16682 return DAG.getMemBasePlusOffset(Base: UniformInner, Offset: Z, DL, Flags: ReassocFlags);
16683 }
16684
16685 return SDValue();
16686}
16687
16688SDValue SITargetLowering::performSubCombine(SDNode *N,
16689 DAGCombinerInfo &DCI) const {
16690 SelectionDAG &DAG = DCI.DAG;
16691 EVT VT = N->getValueType(ResNo: 0);
16692
16693 if (VT == MVT::i64) {
16694 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16695 return Folded;
16696 }
16697
16698 if (VT != MVT::i32)
16699 return SDValue();
16700
16701 SDLoc SL(N);
16702 SDValue LHS = N->getOperand(Num: 0);
16703 SDValue RHS = N->getOperand(Num: 1);
16704
16705 // sub x, zext (setcc) => usubo_carry x, 0, setcc
16706 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
16707 unsigned Opc = RHS.getOpcode();
16708 switch (Opc) {
16709 default:
16710 break;
16711 case ISD::ZERO_EXTEND:
16712 case ISD::SIGN_EXTEND:
16713 case ISD::ANY_EXTEND: {
16714 auto Cond = RHS.getOperand(i: 0);
16715 // If this won't be a real VOPC output, we would still need to insert an
16716 // extra instruction anyway.
16717 if (!isBoolSGPR(V: Cond))
16718 break;
16719 SDVTList VTList = DAG.getVTList(VT1: MVT::i32, VT2: MVT::i1);
16720 SDValue Args[] = {LHS, DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32), Cond};
16721 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
16722 return DAG.getNode(Opcode: Opc, DL: SL, VTList, Ops: Args);
16723 }
16724 }
16725
16726 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
16727 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
16728 if (!isNullConstant(V: LHS.getOperand(i: 1)))
16729 return SDValue();
16730 SDValue Args[] = {LHS.getOperand(i: 0), RHS, LHS.getOperand(i: 2)};
16731 return DAG.getNode(Opcode: ISD::USUBO_CARRY, DL: SDLoc(N), VTList: LHS->getVTList(), Ops: Args);
16732 }
16733 return SDValue();
16734}
16735
16736SDValue
16737SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
16738 DAGCombinerInfo &DCI) const {
16739
16740 if (N->getValueType(ResNo: 0) != MVT::i32)
16741 return SDValue();
16742
16743 if (!isNullConstant(V: N->getOperand(Num: 1)))
16744 return SDValue();
16745
16746 SelectionDAG &DAG = DCI.DAG;
16747 SDValue LHS = N->getOperand(Num: 0);
16748
16749 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
16750 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
16751 unsigned LHSOpc = LHS.getOpcode();
16752 unsigned Opc = N->getOpcode();
16753 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
16754 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
16755 SDValue Args[] = {LHS.getOperand(i: 0), LHS.getOperand(i: 1), N->getOperand(Num: 2)};
16756 return DAG.getNode(Opcode: Opc, DL: SDLoc(N), VTList: N->getVTList(), Ops: Args);
16757 }
16758 return SDValue();
16759}
16760
16761SDValue SITargetLowering::performFAddCombine(SDNode *N,
16762 DAGCombinerInfo &DCI) const {
16763 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
16764 return SDValue();
16765
16766 SelectionDAG &DAG = DCI.DAG;
16767 EVT VT = N->getValueType(ResNo: 0);
16768
16769 SDLoc SL(N);
16770 SDValue LHS = N->getOperand(Num: 0);
16771 SDValue RHS = N->getOperand(Num: 1);
16772
16773 // These should really be instruction patterns, but writing patterns with
16774 // source modifiers is a pain.
16775
16776 // fadd (fadd (a, a), b) -> mad 2.0, a, b
16777 if (LHS.getOpcode() == ISD::FADD) {
16778 SDValue A = LHS.getOperand(i: 0);
16779 if (A == LHS.getOperand(i: 1)) {
16780 unsigned FusedOp = getFusedOpcode(DAG, N0: N, N1: LHS.getNode());
16781 if (FusedOp != 0) {
16782 const SDValue Two = DAG.getConstantFP(Val: 2.0, DL: SL, VT);
16783 return DAG.getNode(Opcode: FusedOp, DL: SL, VT, N1: A, N2: Two, N3: RHS);
16784 }
16785 }
16786 }
16787
16788 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
16789 if (RHS.getOpcode() == ISD::FADD) {
16790 SDValue A = RHS.getOperand(i: 0);
16791 if (A == RHS.getOperand(i: 1)) {
16792 unsigned FusedOp = getFusedOpcode(DAG, N0: N, N1: RHS.getNode());
16793 if (FusedOp != 0) {
16794 const SDValue Two = DAG.getConstantFP(Val: 2.0, DL: SL, VT);
16795 return DAG.getNode(Opcode: FusedOp, DL: SL, VT, N1: A, N2: Two, N3: LHS);
16796 }
16797 }
16798 }
16799
16800 return SDValue();
16801}
16802
16803SDValue SITargetLowering::performFSubCombine(SDNode *N,
16804 DAGCombinerInfo &DCI) const {
16805 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
16806 return SDValue();
16807
16808 SelectionDAG &DAG = DCI.DAG;
16809 SDLoc SL(N);
16810 EVT VT = N->getValueType(ResNo: 0);
16811 assert(!VT.isVector());
16812
16813 // Try to get the fneg to fold into the source modifier. This undoes generic
16814 // DAG combines and folds them into the mad.
16815 //
16816 // Only do this if we are not trying to support denormals. v_mad_f32 does
16817 // not support denormals ever.
16818 SDValue LHS = N->getOperand(Num: 0);
16819 SDValue RHS = N->getOperand(Num: 1);
16820 if (LHS.getOpcode() == ISD::FADD) {
16821 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
16822 SDValue A = LHS.getOperand(i: 0);
16823 if (A == LHS.getOperand(i: 1)) {
16824 unsigned FusedOp = getFusedOpcode(DAG, N0: N, N1: LHS.getNode());
16825 if (FusedOp != 0) {
16826 const SDValue Two = DAG.getConstantFP(Val: 2.0, DL: SL, VT);
16827 SDValue NegRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
16828
16829 return DAG.getNode(Opcode: FusedOp, DL: SL, VT, N1: A, N2: Two, N3: NegRHS);
16830 }
16831 }
16832 }
16833
16834 if (RHS.getOpcode() == ISD::FADD) {
16835 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
16836
16837 SDValue A = RHS.getOperand(i: 0);
16838 if (A == RHS.getOperand(i: 1)) {
16839 unsigned FusedOp = getFusedOpcode(DAG, N0: N, N1: RHS.getNode());
16840 if (FusedOp != 0) {
16841 const SDValue NegTwo = DAG.getConstantFP(Val: -2.0, DL: SL, VT);
16842 return DAG.getNode(Opcode: FusedOp, DL: SL, VT, N1: A, N2: NegTwo, N3: LHS);
16843 }
16844 }
16845 }
16846
16847 return SDValue();
16848}
16849
16850SDValue SITargetLowering::performFDivCombine(SDNode *N,
16851 DAGCombinerInfo &DCI) const {
16852 SelectionDAG &DAG = DCI.DAG;
16853 SDLoc SL(N);
16854 EVT VT = N->getValueType(ResNo: 0);
16855
16856 // fsqrt legality correlates to rsq availability.
16857 if ((VT != MVT::f16 && VT != MVT::bf16) || !isOperationLegal(Op: ISD::FSQRT, VT))
16858 return SDValue();
16859
16860 SDValue LHS = N->getOperand(Num: 0);
16861 SDValue RHS = N->getOperand(Num: 1);
16862
16863 SDNodeFlags Flags = N->getFlags();
16864 SDNodeFlags RHSFlags = RHS->getFlags();
16865 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
16866 !RHS->hasOneUse())
16867 return SDValue();
16868
16869 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(Val&: LHS)) {
16870 bool IsNegative = false;
16871 if (CLHS->isExactlyValue(V: 1.0) ||
16872 (IsNegative = CLHS->isExactlyValue(V: -1.0))) {
16873 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
16874 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
16875 if (RHS.getOpcode() == ISD::FSQRT) {
16876 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
16877 SDValue Rsq =
16878 DAG.getNode(Opcode: AMDGPUISD::RSQ, DL: SL, VT, Operand: RHS.getOperand(i: 0), Flags);
16879 return IsNegative ? DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Rsq, Flags) : Rsq;
16880 }
16881 }
16882 }
16883
16884 return SDValue();
16885}
16886
16887SDValue SITargetLowering::performFMulCombine(SDNode *N,
16888 DAGCombinerInfo &DCI) const {
16889 SelectionDAG &DAG = DCI.DAG;
16890 EVT VT = N->getValueType(ResNo: 0);
16891 EVT ScalarVT = VT.getScalarType();
16892 EVT IntVT = VT.changeElementType(Context&: *DAG.getContext(), EltVT: MVT::i32);
16893
16894 if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
16895 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
16896 // Prefer to use s_mul_f16/f32 instead of v_ldexp_f16/f32.
16897 return SDValue();
16898 }
16899
16900 SDValue LHS = N->getOperand(Num: 0);
16901 SDValue RHS = N->getOperand(Num: 1);
16902
  // It is cheaper to realize i32 inline constants than to materialize f16 or
  // f64 (or even non-inline f32) values, which is possible via ldexp, as
  // shown below:
16906 //
  // Given: A = 2^a and B = 2^b, where a and b are integers.
16908 // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
16909 // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
16910 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
16911 (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
16912 const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(N: RHS.getOperand(i: 1));
16913 if (!TrueNode)
16914 return SDValue();
16915 const ConstantFPSDNode *FalseNode =
16916 isConstOrConstSplatFP(N: RHS.getOperand(i: 2));
16917 if (!FalseNode)
16918 return SDValue();
16919
16920 if (TrueNode->isNegative() != FalseNode->isNegative())
16921 return SDValue();
16922
16923 // For f32, only non-inline constants should be transformed.
16924 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16925 if (ScalarVT == MVT::f32 &&
16926 TII->isInlineConstant(Imm: TrueNode->getValueAPF()) &&
16927 TII->isInlineConstant(Imm: FalseNode->getValueAPF()))
16928 return SDValue();
16929
16930 int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
16931 if (TrueNodeExpVal == INT_MIN)
16932 return SDValue();
16933 int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
16934 if (FalseNodeExpVal == INT_MIN)
16935 return SDValue();
16936
16937 SDLoc SL(N);
16938 SDValue SelectNode =
16939 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: IntVT, N1: RHS.getOperand(i: 0),
16940 N2: DAG.getSignedConstant(Val: TrueNodeExpVal, DL: SL, VT: IntVT),
16941 N3: DAG.getSignedConstant(Val: FalseNodeExpVal, DL: SL, VT: IntVT));
16942
16943 LHS = TrueNode->isNegative()
16944 ? DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: LHS, Flags: LHS->getFlags())
16945 : LHS;
16946
16947 return DAG.getNode(Opcode: ISD::FLDEXP, DL: SL, VT, N1: LHS, N2: SelectNode, Flags: N->getFlags());
16948 }
16949
16950 return SDValue();
16951}
16952
16953SDValue SITargetLowering::performFMACombine(SDNode *N,
16954 DAGCombinerInfo &DCI) const {
16955 SelectionDAG &DAG = DCI.DAG;
16956 EVT VT = N->getValueType(ResNo: 0);
16957 SDLoc SL(N);
16958
16959 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
16960 return SDValue();
16961
  // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
  // FDOT2((V2F16)S0, (V2F16)S1, (F32)z)
16964 SDValue Op1 = N->getOperand(Num: 0);
16965 SDValue Op2 = N->getOperand(Num: 1);
16966 SDValue FMA = N->getOperand(Num: 2);
16967
16968 if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
16969 Op2.getOpcode() != ISD::FP_EXTEND)
16970 return SDValue();
16971
16972 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
16973 // regardless of the denorm mode setting. Therefore,
16974 // fp-contract is sufficient to allow generating fdot2.
16975 const TargetOptions &Options = DAG.getTarget().Options;
16976 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
16977 (N->getFlags().hasAllowContract() &&
16978 FMA->getFlags().hasAllowContract())) {
16979 Op1 = Op1.getOperand(i: 0);
16980 Op2 = Op2.getOperand(i: 0);
16981 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16982 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16983 return SDValue();
16984
16985 SDValue Vec1 = Op1.getOperand(i: 0);
16986 SDValue Idx1 = Op1.getOperand(i: 1);
16987 SDValue Vec2 = Op2.getOperand(i: 0);
16988
16989 SDValue FMAOp1 = FMA.getOperand(i: 0);
16990 SDValue FMAOp2 = FMA.getOperand(i: 1);
16991 SDValue FMAAcc = FMA.getOperand(i: 2);
16992
16993 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
16994 FMAOp2.getOpcode() != ISD::FP_EXTEND)
16995 return SDValue();
16996
16997 FMAOp1 = FMAOp1.getOperand(i: 0);
16998 FMAOp2 = FMAOp2.getOperand(i: 0);
16999 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
17000 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
17001 return SDValue();
17002
17003 SDValue Vec3 = FMAOp1.getOperand(i: 0);
17004 SDValue Vec4 = FMAOp2.getOperand(i: 0);
17005 SDValue Idx2 = FMAOp1.getOperand(i: 1);
17006
17007 if (Idx1 != Op2.getOperand(i: 1) || Idx2 != FMAOp2.getOperand(i: 1) ||
17008 // Idx1 and Idx2 cannot be the same.
17009 Idx1 == Idx2)
17010 return SDValue();
17011
17012 if (Vec1 == Vec2 || Vec3 == Vec4)
17013 return SDValue();
17014
17015 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
17016 return SDValue();
17017
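    // The outer and inner products must read from the same pair of v2f16
    // vectors (in either order) for the whole expression to be a two-element
    // dot product of those vectors plus the accumulator.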
17018 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
17019 return DAG.getNode(Opcode: AMDGPUISD::FDOT2, DL: SL, VT: MVT::f32, N1: Vec1, N2: Vec2, N3: FMAAcc,
17020 N4: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i1));
17021 }
17022 }
17023 return SDValue();
17024}
17025
17026SDValue SITargetLowering::performSetCCCombine(SDNode *N,
17027 DAGCombinerInfo &DCI) const {
17028 SelectionDAG &DAG = DCI.DAG;
17029 SDLoc SL(N);
17030
17031 SDValue LHS = N->getOperand(Num: 0);
17032 SDValue RHS = N->getOperand(Num: 1);
17033 EVT VT = LHS.getValueType();
17034 ISD::CondCode CC = cast<CondCodeSDNode>(Val: N->getOperand(Num: 2))->get();
17035
17036 auto *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
17037 if (!CRHS) {
17038 CRHS = dyn_cast<ConstantSDNode>(Val&: LHS);
17039 if (CRHS) {
17040 std::swap(a&: LHS, b&: RHS);
17041 CC = getSetCCSwappedOperands(Operation: CC);
17042 }
17043 }
17044
17045 if (CRHS) {
17046 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
17047 isBoolSGPR(V: LHS.getOperand(i: 0))) {
17048 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
17049 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
17050 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
17051 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
17052 if ((CRHS->isAllOnes() &&
17053 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
17054 (CRHS->isZero() &&
17055 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
17056 return DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i1, N1: LHS.getOperand(i: 0),
17057 N2: DAG.getAllOnesConstant(DL: SL, VT: MVT::i1));
17058 if ((CRHS->isAllOnes() &&
17059 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
17060 (CRHS->isZero() &&
17061 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
17062 return LHS.getOperand(i: 0);
17063 }
17064
17065 const APInt &CRHSVal = CRHS->getAPIntValue();
17066 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
17067 LHS.getOpcode() == ISD::SELECT &&
17068 isa<ConstantSDNode>(Val: LHS.getOperand(i: 1)) &&
17069 isa<ConstantSDNode>(Val: LHS.getOperand(i: 2)) &&
17070 isBoolSGPR(V: LHS.getOperand(i: 0))) {
17071 // Given CT != FT:
17072 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
17073 // setcc (select cc, CT, CF), CF, ne => cc
17074 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
17075 // setcc (select cc, CT, CF), CT, eq => cc
17076 const APInt &CT = LHS.getConstantOperandAPInt(i: 1);
17077 const APInt &CF = LHS.getConstantOperandAPInt(i: 2);
17078
17079 if (CT != CF) {
17080 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
17081 (CT == CRHSVal && CC == ISD::SETNE))
17082 return DAG.getNOT(DL: SL, Val: LHS.getOperand(i: 0), VT: MVT::i1);
17083 if ((CF == CRHSVal && CC == ISD::SETNE) ||
17084 (CT == CRHSVal && CC == ISD::SETEQ))
17085 return LHS.getOperand(i: 0);
17086 }
17087 }
17088 }
17089
17090 // Eliminate setcc by using carryout from add/sub instruction
17091
17092 // LHS = ADD i64 RHS, Z LHSlo = UADDO i32 RHSlo, Zlo
17093 // setcc LHS ult RHS -> LHSHi = UADDO_CARRY i32 RHShi, Zhi
17094 // similarly for subtraction
17095
17096 // LHS = ADD i64 Y, 1 LHSlo = UADDO i32 Ylo, 1
17097 // setcc LHS eq 0 -> LHSHi = UADDO_CARRY i32 Yhi, 0
17098
17099 if (VT == MVT::i64 && ((CC == ISD::SETULT &&
17100 sd_match(N: LHS, P: m_Add(L: m_Specific(N: RHS), R: m_Value()))) ||
17101 (CC == ISD::SETUGT &&
17102 sd_match(N: LHS, P: m_Sub(L: m_Specific(N: RHS), R: m_Value()))) ||
17103 (CC == ISD::SETEQ && CRHS && CRHS->isZero() &&
17104 sd_match(N: LHS, P: m_Add(L: m_Value(), R: m_One()))))) {
17105 bool IsAdd = LHS.getOpcode() == ISD::ADD;
17106
17107 SDValue Op0 = LHS.getOperand(i: 0);
17108 SDValue Op1 = LHS.getOperand(i: 1);
17109
17110 SDValue Op0Lo = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: Op0);
17111 SDValue Op1Lo = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: Op1);
17112
17113 SDValue Op0Hi = getHiHalf64(Op: Op0, DAG);
17114 SDValue Op1Hi = getHiHalf64(Op: Op1, DAG);
17115
17116 SDValue NodeLo =
17117 DAG.getNode(Opcode: IsAdd ? ISD::UADDO : ISD::USUBO, DL: SL,
17118 VTList: DAG.getVTList(VT1: MVT::i32, VT2: MVT::i1), Ops: {Op0Lo, Op1Lo});
17119
17120 SDValue CarryInHi = NodeLo.getValue(R: 1);
17121 SDValue NodeHi = DAG.getNode(Opcode: IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY,
17122 DL: SL, VTList: DAG.getVTList(VT1: MVT::i32, VT2: MVT::i1),
17123 Ops: {Op0Hi, Op1Hi, CarryInHi});
17124
17125 SDValue ResultLo = NodeLo.getValue(R: 0);
17126 SDValue ResultHi = NodeHi.getValue(R: 0);
17127
17128 SDValue JoinedResult =
17129 DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {ResultLo, ResultHi});
17130
17131 SDValue Result = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: JoinedResult);
17132 SDValue Overflow = NodeHi.getValue(R: 1);
17133 DCI.CombineTo(N: LHS.getNode(), Res: Result);
17134 return Overflow;
17135 }
17136
17137 if (VT != MVT::f32 && VT != MVT::f64 &&
17138 (!Subtarget->has16BitInsts() || VT != MVT::f16))
17139 return SDValue();
17140
17141 // Match isinf/isfinite pattern
17142 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
  // (fcmp one (fabs x), inf) -> (fp_class x,
  //   (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero))
17145 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
17146 LHS.getOpcode() == ISD::FABS) {
17147 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(Val&: RHS);
17148 if (!CRHS)
17149 return SDValue();
17150
17151 const APFloat &APF = CRHS->getValueAPF();
17152 if (APF.isInfinity() && !APF.isNegative()) {
17153 const unsigned IsInfMask =
17154 SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
17155 const unsigned IsFiniteMask =
17156 SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO | SIInstrFlags::N_NORMAL |
17157 SIInstrFlags::P_NORMAL | SIInstrFlags::N_SUBNORMAL |
17158 SIInstrFlags::P_SUBNORMAL;
17159 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
17160 return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL: SL, VT: MVT::i1, N1: LHS.getOperand(i: 0),
17161 N2: DAG.getConstant(Val: Mask, DL: SL, VT: MVT::i32));
17162 }
17163 }
17164
17165 return SDValue();
17166}
17167
17168SDValue
17169SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
17170 DAGCombinerInfo &DCI) const {
17171 SelectionDAG &DAG = DCI.DAG;
17172 SDLoc SL(N);
17173 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
17174
17175 SDValue Src = N->getOperand(Num: 0);
17176 SDValue Shift = N->getOperand(Num: 0);
17177
17178 // TODO: Extend type shouldn't matter (assuming legal types).
17179 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
17180 Shift = Shift.getOperand(i: 0);
17181
17182 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
17183 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
17184 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
17185 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
17186 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
17187 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
17188 if (auto *C = dyn_cast<ConstantSDNode>(Val: Shift.getOperand(i: 1))) {
17189 SDValue Shifted = DAG.getZExtOrTrunc(
17190 Op: Shift.getOperand(i: 0), DL: SDLoc(Shift.getOperand(i: 0)), VT: MVT::i32);
17191
17192 unsigned ShiftOffset = 8 * Offset;
17193 if (Shift.getOpcode() == ISD::SHL)
17194 ShiftOffset -= C->getZExtValue();
17195 else
17196 ShiftOffset += C->getZExtValue();
17197
17198 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
17199 return DAG.getNode(Opcode: AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, DL: SL,
17200 VT: MVT::f32, Operand: Shifted);
17201 }
17202 }
17203 }
17204
17205 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
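  // Only the byte selected by this CVT_F32_UBYTEn is demanded from Src.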
17206 APInt DemandedBits = APInt::getBitsSet(numBits: 32, loBit: 8 * Offset, hiBit: 8 * Offset + 8);
17207 if (TLI.SimplifyDemandedBits(Op: Src, DemandedBits, DCI)) {
17208 // We simplified Src. If this node is not dead, visit it again so it is
17209 // folded properly.
17210 if (N->getOpcode() != ISD::DELETED_NODE)
17211 DCI.AddToWorklist(N);
17212 return SDValue(N, 0);
17213 }
17214
17215 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
17216 if (SDValue DemandedSrc =
17217 TLI.SimplifyMultipleUseDemandedBits(Op: Src, DemandedBits, DAG))
17218 return DAG.getNode(Opcode: N->getOpcode(), DL: SL, VT: MVT::f32, Operand: DemandedSrc);
17219
17220 return SDValue();
17221}
17222
17223SDValue SITargetLowering::performClampCombine(SDNode *N,
17224 DAGCombinerInfo &DCI) const {
17225 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(Val: N->getOperand(Num: 0));
17226 if (!CSrc)
17227 return SDValue();
17228
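  // Fold a clamp of a floating-point constant: constants below zero (and NaN
  // when DX10 clamp is enabled) become +0.0, constants greater than 1.0 become
  // 1.0, and everything else is returned unchanged.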
17229 const MachineFunction &MF = DCI.DAG.getMachineFunction();
17230 const APFloat &F = CSrc->getValueAPF();
17231 APFloat Zero = APFloat::getZero(Sem: F.getSemantics());
17232 if (F < Zero ||
17233 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
17234 return DCI.DAG.getConstantFP(Val: Zero, DL: SDLoc(N), VT: N->getValueType(ResNo: 0));
17235 }
17236
17237 APFloat One(F.getSemantics(), "1.0");
17238 if (F > One)
17239 return DCI.DAG.getConstantFP(Val: One, DL: SDLoc(N), VT: N->getValueType(ResNo: 0));
17240
17241 return SDValue(CSrc, 0);
17242}
17243
17244SDValue SITargetLowering::performSelectCombine(SDNode *N,
17245 DAGCombinerInfo &DCI) const {
17246
17247 // Try to fold CMP + SELECT patterns with shared constants (both FP and
17248 // integer).
17249 // Detect when CMP and SELECT use the same constant and fold them to avoid
17250 // loading the constant twice. Specifically handles patterns like:
17251 // %cmp = icmp eq i32 %val, 4242
17252 // %sel = select i1 %cmp, i32 4242, i32 %other
17253 // It can be optimized to reuse %val instead of 4242 in select.
17254 SDValue Cond = N->getOperand(Num: 0);
17255 SDValue TrueVal = N->getOperand(Num: 1);
17256 SDValue FalseVal = N->getOperand(Num: 2);
17257
17258 // Check if condition is a comparison.
17259 if (Cond.getOpcode() != ISD::SETCC)
17260 return SDValue();
17261
17262 SDValue LHS = Cond.getOperand(i: 0);
17263 SDValue RHS = Cond.getOperand(i: 1);
17264 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Cond.getOperand(i: 2))->get();
17265
17266 bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
17267 bool isInteger = LHS.getValueType().isInteger();
17268
17269 // Handle simple floating-point and integer types only.
17270 if (!isFloatingPoint && !isInteger)
17271 return SDValue();
17272
17273 bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ);
17274 bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE);
17275 if (!isEquality && !isNonEquality)
17276 return SDValue();
17277
17278 SDValue ArgVal, ConstVal;
17279 if ((isFloatingPoint && isa<ConstantFPSDNode>(Val: RHS)) ||
17280 (isInteger && isa<ConstantSDNode>(Val: RHS))) {
17281 ConstVal = RHS;
17282 ArgVal = LHS;
17283 } else if ((isFloatingPoint && isa<ConstantFPSDNode>(Val: LHS)) ||
17284 (isInteger && isa<ConstantSDNode>(Val: LHS))) {
17285 ConstVal = LHS;
17286 ArgVal = RHS;
17287 } else {
17288 return SDValue();
17289 }
17290
17291 // Skip optimization for inlinable immediates.
17292 if (isFloatingPoint) {
17293 const APFloat &Val = cast<ConstantFPSDNode>(Val&: ConstVal)->getValueAPF();
17294 if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Imm: Val))
17295 return SDValue();
17296 } else {
17297 if (AMDGPU::isInlinableIntLiteral(
17298 Literal: cast<ConstantSDNode>(Val&: ConstVal)->getSExtValue()))
17299 return SDValue();
17300 }
17301
17302 // For equality and non-equality comparisons, patterns:
17303 // select (setcc x, const), const, y -> select (setcc x, const), x, y
17304 // select (setccinv x, const), y, const -> select (setccinv x, const), y, x
17305 if (!(isEquality && TrueVal == ConstVal) &&
17306 !(isNonEquality && FalseVal == ConstVal))
17307 return SDValue();
17308
17309 SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
17310 SDValue SelectRHS =
17311 (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
17312 return DCI.DAG.getNode(Opcode: ISD::SELECT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: Cond,
17313 N2: SelectLHS, N3: SelectRHS);
17314}
17315
17316SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
17317 DAGCombinerInfo &DCI) const {
17318 switch (N->getOpcode()) {
17319 case ISD::ADD:
17320 case ISD::SUB:
17321 case ISD::SHL:
17322 case ISD::SRL:
17323 case ISD::SRA:
17324 case ISD::AND:
17325 case ISD::OR:
17326 case ISD::XOR:
17327 case ISD::MUL:
17328 case ISD::SETCC:
17329 case ISD::SELECT:
17330 case ISD::SMIN:
17331 case ISD::SMAX:
17332 case ISD::UMIN:
17333 case ISD::UMAX:
17334 if (auto Res = promoteUniformOpToI32(Op: SDValue(N, 0), DCI))
17335 return Res;
17336 break;
17337 default:
17338 break;
17339 }
17340
17341 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
17342 return SDValue();
17343
17344 switch (N->getOpcode()) {
17345 case ISD::ADD:
17346 return performAddCombine(N, DCI);
17347 case ISD::PTRADD:
17348 return performPtrAddCombine(N, DCI);
17349 case ISD::SUB:
17350 return performSubCombine(N, DCI);
17351 case ISD::UADDO_CARRY:
17352 case ISD::USUBO_CARRY:
17353 return performAddCarrySubCarryCombine(N, DCI);
17354 case ISD::FADD:
17355 return performFAddCombine(N, DCI);
17356 case ISD::FSUB:
17357 return performFSubCombine(N, DCI);
17358 case ISD::FDIV:
17359 return performFDivCombine(N, DCI);
17360 case ISD::FMUL:
17361 return performFMulCombine(N, DCI);
17362 case ISD::SETCC:
17363 return performSetCCCombine(N, DCI);
17364 case ISD::SELECT:
17365 if (auto Res = performSelectCombine(N, DCI))
17366 return Res;
17367 break;
17368 case ISD::FMAXNUM:
17369 case ISD::FMINNUM:
17370 case ISD::FMAXNUM_IEEE:
17371 case ISD::FMINNUM_IEEE:
17372 case ISD::FMAXIMUM:
17373 case ISD::FMINIMUM:
17374 case ISD::FMAXIMUMNUM:
17375 case ISD::FMINIMUMNUM:
17376 case ISD::SMAX:
17377 case ISD::SMIN:
17378 case ISD::UMAX:
17379 case ISD::UMIN:
17380 case AMDGPUISD::FMIN_LEGACY:
17381 case AMDGPUISD::FMAX_LEGACY:
17382 return performMinMaxCombine(N, DCI);
17383 case ISD::FMA:
17384 return performFMACombine(N, DCI);
17385 case ISD::AND:
17386 return performAndCombine(N, DCI);
17387 case ISD::OR:
17388 return performOrCombine(N, DCI);
17389 case ISD::FSHR: {
17390 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17391 if (N->getValueType(ResNo: 0) == MVT::i32 && N->isDivergent() &&
17392 TII->pseudoToMCOpcode(Opcode: AMDGPU::V_PERM_B32_e64) != -1) {
17393 return matchPERM(N, DCI);
17394 }
17395 break;
17396 }
17397 case ISD::XOR:
17398 return performXorCombine(N, DCI);
17399 case ISD::ANY_EXTEND:
17400 case ISD::ZERO_EXTEND:
17401 return performZeroOrAnyExtendCombine(N, DCI);
17402 case ISD::SIGN_EXTEND_INREG:
17403 return performSignExtendInRegCombine(N, DCI);
17404 case AMDGPUISD::FP_CLASS:
17405 return performClassCombine(N, DCI);
17406 case ISD::FCANONICALIZE:
17407 return performFCanonicalizeCombine(N, DCI);
17408 case AMDGPUISD::RCP:
17409 return performRcpCombine(N, DCI);
17410 case ISD::FLDEXP:
17411 case AMDGPUISD::FRACT:
17412 case AMDGPUISD::RSQ:
17413 case AMDGPUISD::RCP_LEGACY:
17414 case AMDGPUISD::RCP_IFLAG:
17415 case AMDGPUISD::RSQ_CLAMP: {
17416 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
17417 SDValue Src = N->getOperand(Num: 0);
17418 if (Src.isUndef())
17419 return Src;
17420 break;
17421 }
17422 case ISD::SINT_TO_FP:
17423 case ISD::UINT_TO_FP:
17424 return performUCharToFloatCombine(N, DCI);
17425 case ISD::FCOPYSIGN:
17426 return performFCopySignCombine(N, DCI);
17427 case AMDGPUISD::CVT_F32_UBYTE0:
17428 case AMDGPUISD::CVT_F32_UBYTE1:
17429 case AMDGPUISD::CVT_F32_UBYTE2:
17430 case AMDGPUISD::CVT_F32_UBYTE3:
17431 return performCvtF32UByteNCombine(N, DCI);
17432 case AMDGPUISD::FMED3:
17433 return performFMed3Combine(N, DCI);
17434 case AMDGPUISD::CVT_PKRTZ_F16_F32:
17435 return performCvtPkRTZCombine(N, DCI);
17436 case AMDGPUISD::CLAMP:
17437 return performClampCombine(N, DCI);
17438 case ISD::SCALAR_TO_VECTOR: {
17439 SelectionDAG &DAG = DCI.DAG;
17440 EVT VT = N->getValueType(ResNo: 0);
17441
17442 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
17443 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
17444 SDLoc SL(N);
17445 SDValue Src = N->getOperand(Num: 0);
17446 EVT EltVT = Src.getValueType();
17447 if (EltVT != MVT::i16)
17448 Src = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i16, Operand: Src);
17449
17450 SDValue Ext = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i32, Operand: Src);
17451 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Ext);
17452 }
17453
17454 break;
17455 }
17456 case ISD::EXTRACT_VECTOR_ELT:
17457 return performExtractVectorEltCombine(N, DCI);
17458 case ISD::INSERT_VECTOR_ELT:
17459 return performInsertVectorEltCombine(N, DCI);
17460 case ISD::FP_ROUND:
17461 return performFPRoundCombine(N, DCI);
17462 case ISD::LOAD: {
17463 if (SDValue Widened = widenLoad(Ld: cast<LoadSDNode>(Val: N), DCI))
17464 return Widened;
17465 [[fallthrough]];
17466 }
17467 default: {
17468 if (!DCI.isBeforeLegalize()) {
17469 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(Val: N))
17470 return performMemSDNodeCombine(N: MemNode, DCI);
17471 }
17472
17473 break;
17474 }
17475 }
17476
17477 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
17478}
17479
17480/// Helper function for adjustWritemask
17481static unsigned SubIdx2Lane(unsigned Idx) {
17482 switch (Idx) {
17483 default:
17484 return ~0u;
17485 case AMDGPU::sub0:
17486 return 0;
17487 case AMDGPU::sub1:
17488 return 1;
17489 case AMDGPU::sub2:
17490 return 2;
17491 case AMDGPU::sub3:
17492 return 3;
17493 case AMDGPU::sub4:
17494 return 4; // Possible with TFE/LWE
17495 }
17496}
17497
17498/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
17499SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
17500 SelectionDAG &DAG) const {
17501 unsigned Opcode = Node->getMachineOpcode();
17502
17503 // Subtract 1 because the vdata output is not a MachineSDNode operand.
17504 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::d16) - 1;
17505 if (D16Idx >= 0 && Node->getConstantOperandVal(Num: D16Idx))
17506 return Node; // not implemented for D16
17507
17508 SDNode *Users[5] = {nullptr};
17509 unsigned Lane = 0;
17510 unsigned DmaskIdx =
17511 AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::dmask) - 1;
17512 unsigned OldDmask = Node->getConstantOperandVal(Num: DmaskIdx);
17513 unsigned NewDmask = 0;
17514 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::tfe) - 1;
17515 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::lwe) - 1;
17516 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(Num: TFEIdx)) ||
17517 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(Num: LWEIdx));
17518 unsigned TFCLane = 0;
17519 bool HasChain = Node->getNumValues() > 1;
17520
17521 if (OldDmask == 0) {
    // These are folded out, but on the off chance it happens, don't assert.
17523 return Node;
17524 }
17525
17526 unsigned OldBitsSet = llvm::popcount(Value: OldDmask);
17527 // Work out which is the TFE/LWE lane if that is enabled.
17528 if (UsesTFC) {
17529 TFCLane = OldBitsSet;
17530 }
17531
17532 // Try to figure out the used register components
17533 for (SDUse &Use : Node->uses()) {
17534
17535 // Don't look at users of the chain.
17536 if (Use.getResNo() != 0)
17537 continue;
17538
17539 SDNode *User = Use.getUser();
17540
17541 // Abort if we can't understand the usage
17542 if (!User->isMachineOpcode() ||
17543 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
17544 return Node;
17545
17546 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
17547 // Note that subregs are packed, i.e. Lane==0 is the first bit set
17548 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
17549 // set, etc.
17550 Lane = SubIdx2Lane(Idx: User->getConstantOperandVal(Num: 1));
17551 if (Lane == ~0u)
17552 return Node;
17553
17554 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
17555 if (UsesTFC && Lane == TFCLane) {
17556 Users[Lane] = User;
17557 } else {
17558 // Set which texture component corresponds to the lane.
17559 unsigned Comp;
17560 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
17561 Comp = llvm::countr_zero(Val: Dmask);
17562 Dmask &= ~(1 << Comp);
17563 }
17564
17565 // Abort if we have more than one user per component.
17566 if (Users[Lane])
17567 return Node;
17568
17569 Users[Lane] = User;
17570 NewDmask |= 1 << Comp;
17571 }
17572 }
17573
17574 // Don't allow 0 dmask, as hardware assumes one channel enabled.
17575 bool NoChannels = !NewDmask;
17576 if (NoChannels) {
17577 if (!UsesTFC) {
17578 // No uses of the result and not using TFC. Then do nothing.
17579 return Node;
17580 }
    // If the original dmask has only one channel, there is nothing to do.
17582 if (OldBitsSet == 1)
17583 return Node;
17584 // Use an arbitrary dmask - required for the instruction to work
17585 NewDmask = 1;
17586 }
17587 // Abort if there's no change
17588 if (NewDmask == OldDmask)
17589 return Node;
17590
17591 unsigned BitsSet = llvm::popcount(Value: NewDmask);
17592
  // Check for TFE or LWE - increase the number of channels by one to account
  // for the extra return value.
  // This will need adjustment for D16 if it is also included in
  // adjustWritemask (this function), but at present D16 is excluded.
17597 unsigned NewChannels = BitsSet + UsesTFC;
17598
17599 int NewOpcode =
17600 AMDGPU::getMaskedMIMGOp(Opc: Node->getMachineOpcode(), NewChannels);
17601 assert(NewOpcode != -1 &&
17602 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
17603 "failed to find equivalent MIMG op");
17604
17605 // Adjust the writemask in the node
17606 SmallVector<SDValue, 12> Ops;
17607 llvm::append_range(C&: Ops, R: Node->ops().take_front(N: DmaskIdx));
17608 Ops.push_back(Elt: DAG.getTargetConstant(Val: NewDmask, DL: SDLoc(Node), VT: MVT::i32));
17609 llvm::append_range(C&: Ops, R: Node->ops().drop_front(N: DmaskIdx + 1));
17610
17611 MVT SVT = Node->getValueType(ResNo: 0).getVectorElementType().getSimpleVT();
17612
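  // Widen awkward channel counts: a 3-channel result uses a 4-element vector
  // and a 5-channel result (4 data channels plus TFE/LWE) uses an 8-element
  // vector.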
17613 MVT ResultVT = NewChannels == 1
17614 ? SVT
17615 : MVT::getVectorVT(VT: SVT, NumElements: NewChannels == 3 ? 4
17616 : NewChannels == 5 ? 8
17617 : NewChannels);
17618 SDVTList NewVTList =
17619 HasChain ? DAG.getVTList(VT1: ResultVT, VT2: MVT::Other) : DAG.getVTList(VT: ResultVT);
17620
17621 MachineSDNode *NewNode =
17622 DAG.getMachineNode(Opcode: NewOpcode, dl: SDLoc(Node), VTs: NewVTList, Ops);
17623
17624 if (HasChain) {
17625 // Update chain.
17626 DAG.setNodeMemRefs(N: NewNode, NewMemRefs: Node->memoperands());
17627 DAG.ReplaceAllUsesOfValueWith(From: SDValue(Node, 1), To: SDValue(NewNode, 1));
17628 }
17629
17630 if (NewChannels == 1) {
17631 assert(Node->hasNUsesOfValue(1, 0));
17632 SDNode *Copy =
17633 DAG.getMachineNode(Opcode: TargetOpcode::COPY, dl: SDLoc(Node),
17634 VT: Users[Lane]->getValueType(ResNo: 0), Op1: SDValue(NewNode, 0));
17635 DAG.ReplaceAllUsesWith(From: Users[Lane], To: Copy);
17636 return nullptr;
17637 }
17638
17639 // Update the users of the node with the new indices
17640 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
17641 SDNode *User = Users[i];
17642 if (!User) {
17643 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
17644 // Users[0] is still nullptr because channel 0 doesn't really have a use.
17645 if (i || !NoChannels)
17646 continue;
17647 } else {
17648 SDValue Op = DAG.getTargetConstant(Val: Idx, DL: SDLoc(User), VT: MVT::i32);
17649 SDNode *NewUser = DAG.UpdateNodeOperands(N: User, Op1: SDValue(NewNode, 0), Op2: Op);
17650 if (NewUser != User) {
17651 DAG.ReplaceAllUsesWith(From: SDValue(User, 0), To: SDValue(NewUser, 0));
17652 DAG.RemoveDeadNode(N: User);
17653 }
17654 }
17655
17656 switch (Idx) {
17657 default:
17658 break;
17659 case AMDGPU::sub0:
17660 Idx = AMDGPU::sub1;
17661 break;
17662 case AMDGPU::sub1:
17663 Idx = AMDGPU::sub2;
17664 break;
17665 case AMDGPU::sub2:
17666 Idx = AMDGPU::sub3;
17667 break;
17668 case AMDGPU::sub3:
17669 Idx = AMDGPU::sub4;
17670 break;
17671 }
17672 }
17673
17674 DAG.RemoveDeadNode(N: Node);
17675 return nullptr;
17676}
17677
17678static bool isFrameIndexOp(SDValue Op) {
17679 if (Op.getOpcode() == ISD::AssertZext)
17680 Op = Op.getOperand(i: 0);
17681
17682 return isa<FrameIndexSDNode>(Val: Op);
17683}
17684
17685/// Legalize target independent instructions (e.g. INSERT_SUBREG)
17686/// with frame index operands.
/// LLVM assumes that inputs to these instructions are registers.
17688SDNode *
17689SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
17690 SelectionDAG &DAG) const {
17691 if (Node->getOpcode() == ISD::CopyToReg) {
17692 RegisterSDNode *DestReg = cast<RegisterSDNode>(Val: Node->getOperand(Num: 1));
17693 SDValue SrcVal = Node->getOperand(Num: 2);
17694
17695 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
17696 // to try understanding copies to physical registers.
17697 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
17698 SDLoc SL(Node);
17699 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
17700 SDValue VReg = DAG.getRegister(
17701 Reg: MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_1RegClass), VT: MVT::i1);
17702
17703 SDNode *Glued = Node->getGluedNode();
17704 SDValue ToVReg = DAG.getCopyToReg(
17705 Chain: Node->getOperand(Num: 0), dl: SL, Reg: VReg, N: SrcVal,
17706 Glue: SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
17707 SDValue ToResultReg = DAG.getCopyToReg(Chain: ToVReg, dl: SL, Reg: SDValue(DestReg, 0),
17708 N: VReg, Glue: ToVReg.getValue(R: 1));
17709 DAG.ReplaceAllUsesWith(From: Node, To: ToResultReg.getNode());
17710 DAG.RemoveDeadNode(N: Node);
17711 return ToResultReg.getNode();
17712 }
17713 }
17714
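  // Replace any frame index operands with an S_MOV_B32 of the frame index so
  // that the node only sees register operands.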
17715 SmallVector<SDValue, 8> Ops;
17716 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
17717 if (!isFrameIndexOp(Op: Node->getOperand(Num: i))) {
17718 Ops.push_back(Elt: Node->getOperand(Num: i));
17719 continue;
17720 }
17721
17722 SDLoc DL(Node);
17723 Ops.push_back(Elt: SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL,
17724 VT: Node->getOperand(Num: i).getValueType(),
17725 Op1: Node->getOperand(Num: i)),
17726 0));
17727 }
17728
17729 return DAG.UpdateNodeOperands(N: Node, Ops);
17730}
17731
17732/// Fold the instructions after selecting them.
17733/// Returns null if users were already updated.
17734SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
17735 SelectionDAG &DAG) const {
17736 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17737 unsigned Opcode = Node->getMachineOpcode();
17738
17739 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
17740 !TII->isGather4(Opcode) &&
17741 AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::dmask)) {
17742 return adjustWritemask(Node, DAG);
17743 }
17744
17745 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
17746 legalizeTargetIndependentNode(Node, DAG);
17747 return Node;
17748 }
17749
17750 switch (Opcode) {
17751 case AMDGPU::V_DIV_SCALE_F32_e64:
17752 case AMDGPU::V_DIV_SCALE_F64_e64: {
17753 // Satisfy the operand register constraint when one of the inputs is
17754 // undefined. Ordinarily each undef value will have its own implicit_def of
17755 // a vreg, so force these to use a single register.
17756 SDValue Src0 = Node->getOperand(Num: 1);
17757 SDValue Src1 = Node->getOperand(Num: 3);
17758 SDValue Src2 = Node->getOperand(Num: 5);
17759
17760 if ((Src0.isMachineOpcode() &&
17761 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
17762 (Src0 == Src1 || Src0 == Src2))
17763 break;
17764
17765 MVT VT = Src0.getValueType().getSimpleVT();
17766 const TargetRegisterClass *RC =
17767 getRegClassFor(VT, isDivergent: Src0.getNode()->isDivergent());
17768
17769 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
17770 SDValue UndefReg = DAG.getRegister(Reg: MRI.createVirtualRegister(RegClass: RC), VT);
17771
17772 SDValue ImpDef = DAG.getCopyToReg(Chain: DAG.getEntryNode(), dl: SDLoc(Node), Reg: UndefReg,
17773 N: Src0, Glue: SDValue());
17774
17775 // src0 must be the same register as src1 or src2, even if the value is
17776 // undefined, so make sure we don't violate this constraint.
17777 if (Src0.isMachineOpcode() &&
17778 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
17779 if (Src1.isMachineOpcode() &&
17780 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
17781 Src0 = Src1;
17782 else if (Src2.isMachineOpcode() &&
17783 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
17784 Src0 = Src2;
17785 else {
17786 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
17787 Src0 = UndefReg;
17788 Src1 = UndefReg;
17789 }
17790 } else
17791 break;
17792
17793 SmallVector<SDValue, 9> Ops(Node->ops());
17794 Ops[1] = Src0;
17795 Ops[3] = Src1;
17796 Ops[5] = Src2;
17797 Ops.push_back(Elt: ImpDef.getValue(R: 1));
17798 return DAG.getMachineNode(Opcode, dl: SDLoc(Node), VTs: Node->getVTList(), Ops);
17799 }
17800 default:
17801 break;
17802 }
17803
17804 return Node;
17805}
17806
17807// Any MIMG instructions that use tfe or lwe require an initialization of the
17808// result register that will be written in the case of a memory access failure.
17809// The required code is also added to tie this init code to the result of the
17810// img instruction.
17811void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
17812 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17813 const SIRegisterInfo &TRI = TII->getRegisterInfo();
17814 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
17815 MachineBasicBlock &MBB = *MI.getParent();
17816
17817 int DstIdx =
17818 AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::vdata);
17819 unsigned InitIdx = 0;
17820
17821 if (TII->isImage(MI)) {
17822 MachineOperand *TFE = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::tfe);
17823 MachineOperand *LWE = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::lwe);
17824 MachineOperand *D16 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::d16);
17825
17826 if (!TFE && !LWE) // intersect_ray
17827 return;
17828
17829 unsigned TFEVal = TFE ? TFE->getImm() : 0;
17830 unsigned LWEVal = LWE ? LWE->getImm() : 0;
17831 unsigned D16Val = D16 ? D16->getImm() : 0;
17832
17833 if (!TFEVal && !LWEVal)
17834 return;
17835
    // At least one of TFE or LWE is non-zero.
17837 // We have to insert a suitable initialization of the result value and
17838 // tie this to the dest of the image instruction.
17839
17840 // Calculate which dword we have to initialize to 0.
17841 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::dmask);
17842
    // Check that the dmask operand is found.
17844 assert(MO_Dmask && "Expected dmask operand in instruction");
17845
17846 unsigned dmask = MO_Dmask->getImm();
    // Determine the number of active lanes, taking into account the
    // Gather4 special case.
17849 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(Value: dmask);
17850
17851 bool Packed = !Subtarget->hasUnpackedD16VMem();
17852
17853 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
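    // For example, with dmask = 0xf (4 active lanes): unpacked or non-D16
    // data needs 4 dwords plus 1 status dword (InitIdx = 5), while packed D16
    // data needs only 2 dwords plus 1 status dword (InitIdx = 3).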
17854
    // Abandon the attempt if the dst size isn't large enough. This is in fact
    // an error, but it is picked up elsewhere and reported correctly.
17858 const TargetRegisterClass *DstRC = TII->getRegClass(MCID: MI.getDesc(), OpNum: DstIdx);
17859
17860 uint32_t DstSize = TRI.getRegSizeInBits(RC: *DstRC) / 32;
17861 if (DstSize < InitIdx)
17862 return;
17863 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(Opc: MI.getOpcode())) {
17864 const TargetRegisterClass *DstRC = TII->getRegClass(MCID: MI.getDesc(), OpNum: DstIdx);
17865 InitIdx = TRI.getRegSizeInBits(RC: *DstRC) / 32;
17866 } else {
17867 return;
17868 }
17869
17870 const DebugLoc &DL = MI.getDebugLoc();
17871
17872 // Create a register for the initialization value.
17873 Register PrevDst = MRI.cloneVirtualRegister(VReg: MI.getOperand(i: DstIdx).getReg());
17874 unsigned NewDst = 0; // Final initialized value will be in here
17875
  // If the PRTStrictNull feature is enabled (the default), initialize all the
  // result registers to 0; otherwise initialize just the error indication
  // register (VGPRn+1).
17879 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
17880 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
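  // For example, with InitIdx = 3: the strict null case zeroes dwords 0..2,
  // while the non-strict case zeroes only dword 2 (the TFE/LWE status dword).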
17881
17882 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: PrevDst);
17883 for (; SizeLeft; SizeLeft--, CurrIdx++) {
17884 NewDst = MRI.createVirtualRegister(RegClass: TII->getOpRegClass(MI, OpNo: DstIdx));
17885 // Initialize dword
17886 Register SubReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
17887 // clang-format off
17888 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: SubReg)
17889 .addImm(Val: 0);
17890 // clang-format on
17891 // Insert into the super-reg
17892 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::INSERT_SUBREG), DestReg: NewDst)
17893 .addReg(RegNo: PrevDst)
17894 .addReg(RegNo: SubReg)
17895 .addImm(Val: SIRegisterInfo::getSubRegFromChannel(Channel: CurrIdx));
17896
17897 PrevDst = NewDst;
17898 }
17899
17900 // Add as an implicit operand
17901 MI.addOperand(Op: MachineOperand::CreateReg(Reg: NewDst, isDef: false, isImp: true));
17902
17903 // Tie the just added implicit operand to the dst
17904 MI.tieOperands(DefIdx: DstIdx, UseIdx: MI.getNumOperands() - 1);
17905}
17906
/// Fix up target-dependent details of \p MI after instruction selection, such
/// as enforcing constant bus restrictions on VOP3 instructions and operand
/// register class alignment on image instructions.
17909void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
17910 SDNode *Node) const {
17911 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17912
17913 MachineFunction *MF = MI.getMF();
17914 MachineRegisterInfo &MRI = MF->getRegInfo();
17915
17916 if (TII->isVOP3(Opcode: MI.getOpcode())) {
17917 // Make sure constant bus requirements are respected.
17918 TII->legalizeOperandsVOP3(MRI, MI);
17919
17920 if (TII->isMAI(MI)) {
17921 // The ordinary src0, src1, src2 were legalized above.
17922 //
      // We also have to legalize the appended v_mfma_ld_scale_b32 operands, as
      // if they were a separate instruction.
17925 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(),
17926 Name: AMDGPU::OpName::scale_src0);
17927 if (Src0Idx != -1) {
17928 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(),
17929 Name: AMDGPU::OpName::scale_src1);
17930 if (TII->usesConstantBus(MRI, MI, OpIdx: Src0Idx) &&
17931 TII->usesConstantBus(MRI, MI, OpIdx: Src1Idx))
17932 TII->legalizeOpWithMove(MI, OpIdx: Src1Idx);
17933 }
17934 }
17935
17936 return;
17937 }
17938
17939 if (TII->isImage(MI))
17940 TII->enforceOperandRCAlignment(MI, OpName: AMDGPU::OpName::vaddr);
17941}
17942
17943static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
17944 uint64_t Val) {
17945 SDValue K = DAG.getTargetConstant(Val, DL, VT: MVT::i32);
17946 return SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32, Op1: K), 0);
17947}
17948
17949MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
17950 const SDLoc &DL,
17951 SDValue Ptr) const {
17952 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17953
  // Build the constant half of the descriptor in its own subregister before
  // building the full 128-bit register. If we are building multiple resource
  // descriptors, this allows CSEing of the 2-component register.
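  //
  // The resulting descriptor holds the 64-bit pointer in dwords 0-1, zero in
  // dword 2, and the upper half of the default resource data format in
  // dword 3.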
17957 const SDValue Ops0[] = {
17958 DAG.getTargetConstant(Val: AMDGPU::SGPR_64RegClassID, DL, VT: MVT::i32),
17959 buildSMovImm32(DAG, DL, Val: 0),
17960 DAG.getTargetConstant(Val: AMDGPU::sub0, DL, VT: MVT::i32),
17961 buildSMovImm32(DAG, DL, Val: TII->getDefaultRsrcDataFormat() >> 32),
17962 DAG.getTargetConstant(Val: AMDGPU::sub1, DL, VT: MVT::i32)};
17963
17964 SDValue SubRegHi = SDValue(
17965 DAG.getMachineNode(Opcode: AMDGPU::REG_SEQUENCE, dl: DL, VT: MVT::v2i32, Ops: Ops0), 0);
17966
17967 // Combine the constants and the pointer.
17968 const SDValue Ops1[] = {
17969 DAG.getTargetConstant(Val: AMDGPU::SGPR_128RegClassID, DL, VT: MVT::i32), Ptr,
17970 DAG.getTargetConstant(Val: AMDGPU::sub0_sub1, DL, VT: MVT::i32), SubRegHi,
17971 DAG.getTargetConstant(Val: AMDGPU::sub2_sub3, DL, VT: MVT::i32)};
17972
17973 return DAG.getMachineNode(Opcode: AMDGPU::REG_SEQUENCE, dl: DL, VT: MVT::v4i32, Ops: Ops1);
17974}
17975
17976/// Return a resource descriptor with the 'Add TID' bit enabled
17977/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
17978/// of the resource descriptor) to create an offset, which is added to
17979/// the resource pointer.
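/// The resulting v4i32 descriptor holds the pointer (with \p RsrcDword1 OR'd
/// into its high half) in dwords 0-1 and \p RsrcDword2And3 in dwords 2-3.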
17980MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
17981 SDValue Ptr, uint32_t RsrcDword1,
17982 uint64_t RsrcDword2And3) const {
17983 SDValue PtrLo = DAG.getTargetExtractSubreg(SRIdx: AMDGPU::sub0, DL, VT: MVT::i32, Operand: Ptr);
17984 SDValue PtrHi = DAG.getTargetExtractSubreg(SRIdx: AMDGPU::sub1, DL, VT: MVT::i32, Operand: Ptr);
17985 if (RsrcDword1) {
17986 PtrHi =
17987 SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_OR_B32, dl: DL, VT: MVT::i32, Op1: PtrHi,
17988 Op2: DAG.getConstant(Val: RsrcDword1, DL, VT: MVT::i32)),
17989 0);
17990 }
17991
17992 SDValue DataLo =
17993 buildSMovImm32(DAG, DL, Val: RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
17994 SDValue DataHi = buildSMovImm32(DAG, DL, Val: RsrcDword2And3 >> 32);
17995
17996 const SDValue Ops[] = {
17997 DAG.getTargetConstant(Val: AMDGPU::SGPR_128RegClassID, DL, VT: MVT::i32),
17998 PtrLo,
17999 DAG.getTargetConstant(Val: AMDGPU::sub0, DL, VT: MVT::i32),
18000 PtrHi,
18001 DAG.getTargetConstant(Val: AMDGPU::sub1, DL, VT: MVT::i32),
18002 DataLo,
18003 DAG.getTargetConstant(Val: AMDGPU::sub2, DL, VT: MVT::i32),
18004 DataHi,
18005 DAG.getTargetConstant(Val: AMDGPU::sub3, DL, VT: MVT::i32)};
18006
18007 return DAG.getMachineNode(Opcode: AMDGPU::REG_SEQUENCE, dl: DL, VT: MVT::v4i32, Ops);
18008}
18009
18010//===----------------------------------------------------------------------===//
18011// SI Inline Assembly Support
18012//===----------------------------------------------------------------------===//
18013
18014std::pair<unsigned, const TargetRegisterClass *>
18015SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
18016 StringRef Constraint,
18017 MVT VT) const {
18018 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
18019
18020 const TargetRegisterClass *RC = nullptr;
18021 if (Constraint.size() == 1) {
    // Check if we cannot determine the bit size of the given value type. This
    // can happen, for example, when we have an empty struct (size 0):
    // `call void asm "", "v"({} poison)`.
18025 if (VT == MVT::Other)
18026 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
18027 const unsigned BitWidth = VT.getSizeInBits();
18028 switch (Constraint[0]) {
18029 default:
18030 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
18031 case 's':
18032 case 'r':
18033 switch (BitWidth) {
18034 case 16:
18035 RC = &AMDGPU::SReg_32RegClass;
18036 break;
18037 case 64:
18038 RC = &AMDGPU::SGPR_64RegClass;
18039 break;
18040 default:
18041 RC = SIRegisterInfo::getSGPRClassForBitWidth(BitWidth);
18042 if (!RC)
18043 return std::pair(0U, nullptr);
18044 break;
18045 }
18046 break;
18047 case 'v':
18048 switch (BitWidth) {
18049 case 1:
18050 return std::pair(0U, nullptr);
18051 case 16:
18052 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
18053 : &AMDGPU::VGPR_32_Lo256RegClass;
18054 break;
18055 default:
18056 RC = Subtarget->has1024AddressableVGPRs()
18057 ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
18058 : TRI->getVGPRClassForBitWidth(BitWidth);
18059 if (!RC)
18060 return std::pair(0U, nullptr);
18061 break;
18062 }
18063 break;
18064 case 'a':
18065 if (!Subtarget->hasMAIInsts())
18066 break;
18067 switch (BitWidth) {
18068 case 1:
18069 return std::pair(0U, nullptr);
18070 case 16:
18071 RC = &AMDGPU::AGPR_32RegClass;
18072 break;
18073 default:
18074 RC = TRI->getAGPRClassForBitWidth(BitWidth);
18075 if (!RC)
18076 return std::pair(0U, nullptr);
18077 break;
18078 }
18079 break;
18080 }
18081 } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
18082 const unsigned BitWidth = VT.getSizeInBits();
18083 switch (BitWidth) {
18084 case 16:
18085 RC = &AMDGPU::AV_32RegClass;
18086 break;
18087 default:
18088 RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
18089 if (!RC)
18090 return std::pair(0U, nullptr);
18091 break;
18092 }
18093 }
18094
  // We actually support i128, i16 and f16 as inline parameters
  // even if they are not reported as legal.
18097 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
18098 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
18099 return std::pair(0U, RC);
18100
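  // Handle explicit physical register constraints. For example, a constraint
  // such as "{v[8:9]}" would be expected to parse as Kind = 'v', Idx = 8,
  // NumRegs = 2 and to map to the v[8:9] pair in a 64-bit VGPR class.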
18101 auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint);
18102 if (Kind != '\0') {
18103 if (Kind == 'v') {
18104 RC = &AMDGPU::VGPR_32_Lo256RegClass;
18105 } else if (Kind == 's') {
18106 RC = &AMDGPU::SGPR_32RegClass;
18107 } else if (Kind == 'a') {
18108 RC = &AMDGPU::AGPR_32RegClass;
18109 }
18110
18111 if (RC) {
18112 if (NumRegs > 1) {
18113 if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 >= RC->getNumRegs())
18114 return std::pair(0U, nullptr);
18115
18116 uint32_t Width = NumRegs * 32;
18117 // Prohibit constraints for register ranges with a width that does not
18118 // match the required type.
18119 if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
18120 return std::pair(0U, nullptr);
18121
18122 MCRegister Reg = RC->getRegister(i: Idx);
18123 if (SIRegisterInfo::isVGPRClass(RC))
18124 RC = TRI->getVGPRClassForBitWidth(BitWidth: Width);
18125 else if (SIRegisterInfo::isSGPRClass(RC))
18126 RC = TRI->getSGPRClassForBitWidth(BitWidth: Width);
18127 else if (SIRegisterInfo::isAGPRClass(RC))
18128 RC = TRI->getAGPRClassForBitWidth(BitWidth: Width);
18129 if (RC) {
18130 Reg = TRI->getMatchingSuperReg(Reg, SubIdx: AMDGPU::sub0, RC);
18131 if (!Reg) {
18132 // The register class does not contain the requested register,
18133 // e.g., because it is an SGPR pair that would violate alignment
18134 // requirements.
18135 return std::pair(0U, nullptr);
18136 }
18137 return std::pair(Reg, RC);
18138 }
18139 }
18140
18141 // Check for lossy scalar/vector conversions.
18142 if (VT.isVector() && VT.getSizeInBits() != 32)
18143 return std::pair(0U, nullptr);
18144 if (Idx < RC->getNumRegs())
18145 return std::pair(RC->getRegister(i: Idx), RC);
18146 return std::pair(0U, nullptr);
18147 }
18148 }
18149
18150 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
18151 if (Ret.first)
18152 Ret.second = TRI->getPhysRegBaseClass(Reg: Ret.first);
18153
18154 return Ret;
18155}
18156
18157static bool isImmConstraint(StringRef Constraint) {
18158 if (Constraint.size() == 1) {
18159 switch (Constraint[0]) {
18160 default:
18161 break;
18162 case 'I':
18163 case 'J':
18164 case 'A':
18165 case 'B':
18166 case 'C':
18167 return true;
18168 }
18169 } else if (Constraint == "DA" || Constraint == "DB") {
18170 return true;
18171 }
18172 return false;
18173}
18174
18175SITargetLowering::ConstraintType
18176SITargetLowering::getConstraintType(StringRef Constraint) const {
18177 if (Constraint.size() == 1) {
18178 switch (Constraint[0]) {
18179 default:
18180 break;
18181 case 's':
18182 case 'v':
18183 case 'a':
18184 return C_RegisterClass;
18185 }
18186 } else if (Constraint.size() == 2) {
18187 if (Constraint == "VA")
18188 return C_RegisterClass;
18189 }
18190 if (isImmConstraint(Constraint)) {
18191 return C_Other;
18192 }
18193 return TargetLowering::getConstraintType(Constraint);
18194}
18195
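// Mask a literal down to the operand size unless it is already an inlinable
// integer literal (e.g. for a 16-bit operand, 0x12345678 becomes 0x5678,
// while -1 is kept as-is since it is inlinable).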
18196static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
18197 if (!AMDGPU::isInlinableIntLiteral(Literal: Val)) {
18198 Val = Val & maskTrailingOnes<uint64_t>(N: Size);
18199 }
18200 return Val;
18201}
18202
18203void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
18204 StringRef Constraint,
18205 std::vector<SDValue> &Ops,
18206 SelectionDAG &DAG) const {
18207 if (isImmConstraint(Constraint)) {
18208 uint64_t Val;
18209 if (getAsmOperandConstVal(Op, Val) &&
18210 checkAsmConstraintVal(Op, Constraint, Val)) {
18211 Val = clearUnusedBits(Val, Size: Op.getScalarValueSizeInBits());
18212 Ops.push_back(x: DAG.getTargetConstant(Val, DL: SDLoc(Op), VT: MVT::i64));
18213 }
18214 } else {
18215 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
18216 }
18217}
18218
18219bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
18220 unsigned Size = Op.getScalarValueSizeInBits();
18221 if (Size > 64)
18222 return false;
18223
18224 if (Size == 16 && !Subtarget->has16BitInsts())
18225 return false;
18226
18227 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: Op)) {
18228 Val = C->getSExtValue();
18229 return true;
18230 }
18231 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: Op)) {
18232 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
18233 return true;
18234 }
18235 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Val&: Op)) {
18236 if (Size != 16 || Op.getNumOperands() != 2)
18237 return false;
18238 if (Op.getOperand(i: 0).isUndef() || Op.getOperand(i: 1).isUndef())
18239 return false;
18240 if (ConstantSDNode *C = V->getConstantSplatNode()) {
18241 Val = C->getSExtValue();
18242 return true;
18243 }
18244 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
18245 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
18246 return true;
18247 }
18248 }
18249
18250 return false;
18251}
18252
18253bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
18254 uint64_t Val) const {
18255 if (Constraint.size() == 1) {
18256 switch (Constraint[0]) {
18257 case 'I':
18258 return AMDGPU::isInlinableIntLiteral(Literal: Val);
18259 case 'J':
18260 return isInt<16>(x: Val);
18261 case 'A':
18262 return checkAsmConstraintValA(Op, Val);
18263 case 'B':
18264 return isInt<32>(x: Val);
18265 case 'C':
18266 return isUInt<32>(x: clearUnusedBits(Val, Size: Op.getScalarValueSizeInBits())) ||
18267 AMDGPU::isInlinableIntLiteral(Literal: Val);
18268 default:
18269 break;
18270 }
18271 } else if (Constraint.size() == 2) {
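    // "DA": both 32-bit halves must independently be valid inline constants.
    // "DB": any 64-bit literal is accepted.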
18272 if (Constraint == "DA") {
18273 int64_t HiBits = static_cast<int32_t>(Val >> 32);
18274 int64_t LoBits = static_cast<int32_t>(Val);
18275 return checkAsmConstraintValA(Op, Val: HiBits, MaxSize: 32) &&
18276 checkAsmConstraintValA(Op, Val: LoBits, MaxSize: 32);
18277 }
18278 if (Constraint == "DB") {
18279 return true;
18280 }
18281 }
18282 llvm_unreachable("Invalid asm constraint");
18283}
18284
18285bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
18286 unsigned MaxSize) const {
18287 unsigned Size = std::min<unsigned>(a: Op.getScalarValueSizeInBits(), b: MaxSize);
18288 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
18289 if (Size == 16) {
18290 MVT VT = Op.getSimpleValueType();
18291 switch (VT.SimpleTy) {
18292 default:
18293 return false;
18294 case MVT::i16:
18295 return AMDGPU::isInlinableLiteralI16(Literal: Val, HasInv2Pi);
18296 case MVT::f16:
18297 return AMDGPU::isInlinableLiteralFP16(Literal: Val, HasInv2Pi);
18298 case MVT::bf16:
18299 return AMDGPU::isInlinableLiteralBF16(Literal: Val, HasInv2Pi);
18300 case MVT::v2i16:
18301 return AMDGPU::getInlineEncodingV2I16(Literal: Val).has_value();
18302 case MVT::v2f16:
18303 return AMDGPU::getInlineEncodingV2F16(Literal: Val).has_value();
18304 case MVT::v2bf16:
18305 return AMDGPU::getInlineEncodingV2BF16(Literal: Val).has_value();
18306 }
18307 }
18308 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Literal: Val, HasInv2Pi)) ||
18309 (Size == 64 && AMDGPU::isInlinableLiteral64(Literal: Val, HasInv2Pi)))
18310 return true;
18311 return false;
18312}
18313
18314static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
18315 switch (UnalignedClassID) {
18316 case AMDGPU::VReg_64RegClassID:
18317 return AMDGPU::VReg_64_Align2RegClassID;
18318 case AMDGPU::VReg_96RegClassID:
18319 return AMDGPU::VReg_96_Align2RegClassID;
18320 case AMDGPU::VReg_128RegClassID:
18321 return AMDGPU::VReg_128_Align2RegClassID;
18322 case AMDGPU::VReg_160RegClassID:
18323 return AMDGPU::VReg_160_Align2RegClassID;
18324 case AMDGPU::VReg_192RegClassID:
18325 return AMDGPU::VReg_192_Align2RegClassID;
18326 case AMDGPU::VReg_224RegClassID:
18327 return AMDGPU::VReg_224_Align2RegClassID;
18328 case AMDGPU::VReg_256RegClassID:
18329 return AMDGPU::VReg_256_Align2RegClassID;
18330 case AMDGPU::VReg_288RegClassID:
18331 return AMDGPU::VReg_288_Align2RegClassID;
18332 case AMDGPU::VReg_320RegClassID:
18333 return AMDGPU::VReg_320_Align2RegClassID;
18334 case AMDGPU::VReg_352RegClassID:
18335 return AMDGPU::VReg_352_Align2RegClassID;
18336 case AMDGPU::VReg_384RegClassID:
18337 return AMDGPU::VReg_384_Align2RegClassID;
18338 case AMDGPU::VReg_512RegClassID:
18339 return AMDGPU::VReg_512_Align2RegClassID;
18340 case AMDGPU::VReg_1024RegClassID:
18341 return AMDGPU::VReg_1024_Align2RegClassID;
18342 case AMDGPU::AReg_64RegClassID:
18343 return AMDGPU::AReg_64_Align2RegClassID;
18344 case AMDGPU::AReg_96RegClassID:
18345 return AMDGPU::AReg_96_Align2RegClassID;
18346 case AMDGPU::AReg_128RegClassID:
18347 return AMDGPU::AReg_128_Align2RegClassID;
18348 case AMDGPU::AReg_160RegClassID:
18349 return AMDGPU::AReg_160_Align2RegClassID;
18350 case AMDGPU::AReg_192RegClassID:
18351 return AMDGPU::AReg_192_Align2RegClassID;
18352 case AMDGPU::AReg_256RegClassID:
18353 return AMDGPU::AReg_256_Align2RegClassID;
18354 case AMDGPU::AReg_512RegClassID:
18355 return AMDGPU::AReg_512_Align2RegClassID;
18356 case AMDGPU::AReg_1024RegClassID:
18357 return AMDGPU::AReg_1024_Align2RegClassID;
18358 default:
18359 return -1;
18360 }
18361}
18362
18363// Figure out which registers should be reserved for stack access. Only after
18364// the function is legalized do we know all of the non-spill stack objects or if
18365// calls are present.
18366void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
18367 MachineRegisterInfo &MRI = MF.getRegInfo();
18368 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
18369 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
18370 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18371 const SIInstrInfo *TII = ST.getInstrInfo();
18372
18373 if (Info->isEntryFunction()) {
18374 // Callable functions have fixed registers used for stack access.
18375 reservePrivateMemoryRegs(TM: getTargetMachine(), MF, TRI: *TRI, Info&: *Info);
18376 }
18377
18378 // TODO: Move this logic to getReservedRegs()
18379 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
18380 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
18381 Register SReg = ST.isWave32()
18382 ? AMDGPU::SGPR_32RegClass.getRegister(i: MaxNumSGPRs - 1)
18383 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
18384 RC: &AMDGPU::SGPR_64RegClass);
18385 Info->setSGPRForEXECCopy(SReg);
18386
18387 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
18388 Info->getStackPtrOffsetReg()));
18389 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
18390 MRI.replaceRegWith(FromReg: AMDGPU::SP_REG, ToReg: Info->getStackPtrOffsetReg());
18391
  // Guard against replacing the default register with itself, which can
  // happen for MIR testcases that are missing the MFI.
18394 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
18395 MRI.replaceRegWith(FromReg: AMDGPU::PRIVATE_RSRC_REG, ToReg: Info->getScratchRSrcReg());
18396
18397 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
18398 MRI.replaceRegWith(FromReg: AMDGPU::FP_REG, ToReg: Info->getFrameOffsetReg());
18399
18400 Info->limitOccupancy(MF);
18401
18402 if (ST.isWave32() && !MF.empty()) {
18403 for (auto &MBB : MF) {
18404 for (auto &MI : MBB) {
18405 TII->fixImplicitOperands(MI);
18406 }
18407 }
18408 }
18409
18410 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
18411 // classes if required. Ideally the register class constraints would differ
18412 // per-subtarget, but there's no easy way to achieve that right now. This is
18413 // not a problem for VGPRs because the correctly aligned VGPR class is implied
18414 // from using them as the register class for legal types.
18415 if (ST.needsAlignedVGPRs()) {
18416 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
18417 const Register Reg = Register::index2VirtReg(Index: I);
18418 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
18419 if (!RC)
18420 continue;
18421 int NewClassID = getAlignedAGPRClassID(UnalignedClassID: RC->getID());
18422 if (NewClassID != -1)
18423 MRI.setRegClass(Reg, RC: TRI->getRegClass(i: NewClassID));
18424 }
18425 }
18426
18427 TargetLoweringBase::finalizeLowering(MF);
18428}
18429
18430void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
18431 KnownBits &Known,
18432 const APInt &DemandedElts,
18433 const SelectionDAG &DAG,
18434 unsigned Depth) const {
18435 Known.resetAll();
18436 unsigned Opc = Op.getOpcode();
18437 switch (Opc) {
18438 case ISD::INTRINSIC_WO_CHAIN: {
18439 unsigned IID = Op.getConstantOperandVal(i: 0);
18440 switch (IID) {
18441 case Intrinsic::amdgcn_mbcnt_lo:
18442 case Intrinsic::amdgcn_mbcnt_hi: {
18443 const GCNSubtarget &ST =
18444 DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
18445 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
18446 // most 31 + src1.
18447 Known.Zero.setBitsFrom(
18448 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
18449 KnownBits Known2 = DAG.computeKnownBits(Op: Op.getOperand(i: 2), Depth: Depth + 1);
18450 Known = KnownBits::add(LHS: Known, RHS: Known2);
18451 return;
18452 }
18453 }
18454 break;
18455 }
18456 }
18457 return AMDGPUTargetLowering::computeKnownBitsForTargetNode(
18458 Op, Known, DemandedElts, DAG, Depth);
18459}
18460
18461void SITargetLowering::computeKnownBitsForFrameIndex(
18462 const int FI, KnownBits &Known, const MachineFunction &MF) const {
18463 TargetLowering::computeKnownBitsForFrameIndex(FIOp: FI, Known, MF);
18464
18465 // Set the high bits to zero based on the maximum allowed scratch size per
18466 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
18467 // calculation won't overflow, so assume the sign bit is never set.
18468 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
18469}
18470
18471static void knownBitsForWorkitemID(const GCNSubtarget &ST,
18472 GISelValueTracking &VT, KnownBits &Known,
18473 unsigned Dim) {
18474 unsigned MaxValue =
18475 ST.getMaxWorkitemID(Kernel: VT.getMachineFunction().getFunction(), Dimension: Dim);
18476 Known.Zero.setHighBits(llvm::countl_zero(Val: MaxValue));
18477}
18478
18479static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT,
18480 KnownBits &Known, const APInt &DemandedElts,
18481 unsigned BFEWidth, bool SExt, unsigned Depth) {
18482 const MachineRegisterInfo &MRI = VT.getMachineFunction().getRegInfo();
18483 const MachineOperand &Src1 = MI.getOperand(i: 2);
18484
18485 unsigned Src1Cst = 0;
18486 if (Src1.isImm()) {
18487 Src1Cst = Src1.getImm();
18488 } else if (Src1.isReg()) {
18489 auto Cst = getIConstantVRegValWithLookThrough(VReg: Src1.getReg(), MRI);
18490 if (!Cst)
18491 return;
18492 Src1Cst = Cst->Value.getZExtValue();
18493 } else {
18494 return;
18495 }
18496
  // The offset is at bits [4:0] for 32-bit BFE, [5:0] for 64-bit.
  // The width is always at bits [22:16].
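  // For example, with Src1Cst = 0x00080008 on S_BFE_U32, Offset = 8 and
  // Width = 8, so the known bits of bits [15:8] of the source are extracted
  // and then zero extended back to 32 bits.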
18499 const unsigned Offset =
18500 Src1Cst & maskTrailingOnes<unsigned>(N: (BFEWidth == 32) ? 5 : 6);
18501 const unsigned Width = (Src1Cst >> 16) & maskTrailingOnes<unsigned>(N: 6);
18502
18503 if (Width >= BFEWidth) // Ill-formed.
18504 return;
18505
18506 VT.computeKnownBitsImpl(R: MI.getOperand(i: 1).getReg(), Known, DemandedElts,
18507 Depth: Depth + 1);
18508
18509 Known = Known.extractBits(NumBits: Width, BitPosition: Offset);
18510
18511 if (SExt)
18512 Known = Known.sext(BitWidth: BFEWidth);
18513 else
18514 Known = Known.zext(BitWidth: BFEWidth);
18515}
18516
18517void SITargetLowering::computeKnownBitsForTargetInstr(
18518 GISelValueTracking &VT, Register R, KnownBits &Known,
18519 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
18520 unsigned Depth) const {
18521 Known.resetAll();
18522 const MachineInstr *MI = MRI.getVRegDef(Reg: R);
18523 switch (MI->getOpcode()) {
18524 case AMDGPU::S_BFE_I32:
18525 return knownBitsForSBFE(MI: *MI, VT, Known, DemandedElts, /*Width=*/BFEWidth: 32,
18526 /*SExt=*/true, Depth);
18527 case AMDGPU::S_BFE_U32:
18528 return knownBitsForSBFE(MI: *MI, VT, Known, DemandedElts, /*Width=*/BFEWidth: 32,
18529 /*SExt=*/false, Depth);
18530 case AMDGPU::S_BFE_I64:
18531 return knownBitsForSBFE(MI: *MI, VT, Known, DemandedElts, /*Width=*/BFEWidth: 64,
18532 /*SExt=*/true, Depth);
18533 case AMDGPU::S_BFE_U64:
18534 return knownBitsForSBFE(MI: *MI, VT, Known, DemandedElts, /*Width=*/BFEWidth: 64,
18535 /*SExt=*/false, Depth);
18536 case AMDGPU::G_INTRINSIC:
18537 case AMDGPU::G_INTRINSIC_CONVERGENT: {
18538 Intrinsic::ID IID = cast<GIntrinsic>(Val: MI)->getIntrinsicID();
18539 switch (IID) {
18540 case Intrinsic::amdgcn_workitem_id_x:
18541 knownBitsForWorkitemID(ST: *getSubtarget(), VT, Known, Dim: 0);
18542 break;
18543 case Intrinsic::amdgcn_workitem_id_y:
18544 knownBitsForWorkitemID(ST: *getSubtarget(), VT, Known, Dim: 1);
18545 break;
18546 case Intrinsic::amdgcn_workitem_id_z:
18547 knownBitsForWorkitemID(ST: *getSubtarget(), VT, Known, Dim: 2);
18548 break;
18549 case Intrinsic::amdgcn_mbcnt_lo:
18550 case Intrinsic::amdgcn_mbcnt_hi: {
18551 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
18552 // most 31 + src1.
18553 Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
18554 ? getSubtarget()->getWavefrontSizeLog2()
18555 : 5);
18556 KnownBits Known2;
18557 VT.computeKnownBitsImpl(R: MI->getOperand(i: 3).getReg(), Known&: Known2, DemandedElts,
18558 Depth: Depth + 1);
18559 Known = KnownBits::add(LHS: Known, RHS: Known2);
18560 break;
18561 }
18562 case Intrinsic::amdgcn_groupstaticsize: {
18563 // We can report everything over the maximum size as 0. We can't report
18564 // based on the actual size because we don't know if it's accurate or not
18565 // at any given point.
18566 Known.Zero.setHighBits(
18567 llvm::countl_zero(Val: getSubtarget()->getAddressableLocalMemorySize()));
18568 break;
18569 }
18570 }
18571 break;
18572 }
18573 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
18574 Known.Zero.setHighBits(24);
18575 break;
18576 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
18577 Known.Zero.setHighBits(16);
18578 break;
18579 case AMDGPU::G_AMDGPU_SMED3:
18580 case AMDGPU::G_AMDGPU_UMED3: {
18581 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
18582
18583 KnownBits Known2;
18584 VT.computeKnownBitsImpl(R: Src2, Known&: Known2, DemandedElts, Depth: Depth + 1);
18585 if (Known2.isUnknown())
18586 break;
18587
18588 KnownBits Known1;
18589 VT.computeKnownBitsImpl(R: Src1, Known&: Known1, DemandedElts, Depth: Depth + 1);
18590 if (Known1.isUnknown())
18591 break;
18592
18593 KnownBits Known0;
18594 VT.computeKnownBitsImpl(R: Src0, Known&: Known0, DemandedElts, Depth: Depth + 1);
18595 if (Known0.isUnknown())
18596 break;
18597
18598 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
18599 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
18600 Known.One = Known0.One & Known1.One & Known2.One;
18601 break;
18602 }
18603 }
18604}
18605
18606Align SITargetLowering::computeKnownAlignForTargetInstr(
18607 GISelValueTracking &VT, Register R, const MachineRegisterInfo &MRI,
18608 unsigned Depth) const {
18609 const MachineInstr *MI = MRI.getVRegDef(Reg: R);
18610 if (auto *GI = dyn_cast<GIntrinsic>(Val: MI)) {
18611 // FIXME: Can this move to generic code? What about the case where the call
18612 // site specifies a lower alignment?
18613 Intrinsic::ID IID = GI->getIntrinsicID();
18614 LLVMContext &Ctx = VT.getMachineFunction().getFunction().getContext();
18615 AttributeList Attrs =
18616 Intrinsic::getAttributes(C&: Ctx, id: IID, FT: Intrinsic::getType(Context&: Ctx, id: IID));
18617 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
18618 return *RetAlign;
18619 }
18620 return Align(1);
18621}
18622
18623Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
18624 const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
18625 const Align CacheLineAlign = Align(64);
18626
  // Pre-GFX10 targets do not benefit from loop alignment.
18628 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
18629 getSubtarget()->hasInstFwdPrefetchBug())
18630 return PrefAlign;
18631
  // On GFX10 the I$ consists of 4 x 64-byte cache lines.
  // By default the prefetcher keeps one cache line behind and reads two ahead.
  // For larger loops we can modify it with S_INST_PREFETCH to keep two lines
  // behind and one ahead.
  // Therefore we can benefit from aligning loop headers if the loop fits in
  // 192 bytes.
  // If the loop fits in 64 bytes it always spans no more than two cache lines
  // and does not need an alignment.
  // Otherwise, if the loop is at most 128 bytes we do not need to modify the
  // prefetch settings; if it is at most 192 bytes we need two lines behind.
18641
18642 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
18643 const MachineBasicBlock *Header = ML->getHeader();
18644 if (Header->getAlignment() != PrefAlign)
18645 return Header->getAlignment(); // Already processed.
18646
18647 unsigned LoopSize = 0;
18648 for (const MachineBasicBlock *MBB : ML->blocks()) {
    // If an inner loop block is aligned, assume on average half of the
    // alignment size is added as nops.
18651 if (MBB != Header)
18652 LoopSize += MBB->getAlignment().value() / 2;
18653
18654 for (const MachineInstr &MI : *MBB) {
18655 LoopSize += TII->getInstSizeInBytes(MI);
18656 if (LoopSize > 192)
18657 return PrefAlign;
18658 }
18659 }
18660
18661 if (LoopSize <= 64)
18662 return PrefAlign;
18663
18664 if (LoopSize <= 128)
18665 return CacheLineAlign;
18666
  // If any of the parent loops is surrounded by prefetch instructions, do not
  // insert new ones for the inner loop, as that would reset the parent's
  // settings.
18669 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
18670 if (MachineBasicBlock *Exit = P->getExitBlock()) {
18671 auto I = Exit->getFirstNonDebugInstr();
18672 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
18673 return CacheLineAlign;
18674 }
18675 }
18676
18677 MachineBasicBlock *Pre = ML->getLoopPreheader();
18678 MachineBasicBlock *Exit = ML->getExitBlock();
18679
18680 if (Pre && Exit) {
18681 auto PreTerm = Pre->getFirstTerminator();
18682 if (PreTerm == Pre->begin() ||
18683 std::prev(x: PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
18684 BuildMI(BB&: *Pre, I: PreTerm, MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_INST_PREFETCH))
18685 .addImm(Val: 1); // prefetch 2 lines behind PC
18686
18687 auto ExitHead = Exit->getFirstNonDebugInstr();
18688 if (ExitHead == Exit->end() ||
18689 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
18690 BuildMI(BB&: *Exit, I: ExitHead, MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_INST_PREFETCH))
18691 .addImm(Val: 2); // prefetch 1 line behind PC
18692 }
18693
18694 return CacheLineAlign;
18695}
18696
18697[[maybe_unused]]
18698static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
18699 assert(N->getOpcode() == ISD::CopyFromReg);
18700 do {
18701 // Follow the chain until we find an INLINEASM node.
18702 N = N->getOperand(Num: 0).getNode();
18703 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
18704 return true;
18705 } while (N->getOpcode() == ISD::CopyFromReg);
18706 return false;
18707}
18708
18709bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
18710 FunctionLoweringInfo *FLI,
18711 UniformityInfo *UA) const {
18712 switch (N->getOpcode()) {
18713 case ISD::CopyFromReg: {
18714 const RegisterSDNode *R = cast<RegisterSDNode>(Val: N->getOperand(Num: 1));
18715 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
18716 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18717 Register Reg = R->getReg();
18718
18719 // FIXME: Why does this need to consider isLiveIn?
18720 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
18721 return !TRI->isSGPRReg(MRI, Reg);
18722
18723 if (const Value *V = FLI->getValueFromVirtualReg(Vreg: R->getReg()))
18724 return UA->isDivergent(V);
18725
18726 assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
18727 return !TRI->isSGPRReg(MRI, Reg);
18728 }
18729 case ISD::LOAD: {
18730 const LoadSDNode *L = cast<LoadSDNode>(Val: N);
18731 unsigned AS = L->getAddressSpace();
18732 // A flat load may access private memory.
18733 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
18734 }
18735 case ISD::CALLSEQ_END:
18736 return true;
18737 case ISD::INTRINSIC_WO_CHAIN:
18738 return AMDGPU::isIntrinsicSourceOfDivergence(IntrID: N->getConstantOperandVal(Num: 0));
18739 case ISD::INTRINSIC_W_CHAIN:
18740 return AMDGPU::isIntrinsicSourceOfDivergence(IntrID: N->getConstantOperandVal(Num: 1));
18741 case AMDGPUISD::ATOMIC_CMP_SWAP:
18742 case AMDGPUISD::BUFFER_ATOMIC_SWAP:
18743 case AMDGPUISD::BUFFER_ATOMIC_ADD:
18744 case AMDGPUISD::BUFFER_ATOMIC_SUB:
18745 case AMDGPUISD::BUFFER_ATOMIC_SMIN:
18746 case AMDGPUISD::BUFFER_ATOMIC_UMIN:
18747 case AMDGPUISD::BUFFER_ATOMIC_SMAX:
18748 case AMDGPUISD::BUFFER_ATOMIC_UMAX:
18749 case AMDGPUISD::BUFFER_ATOMIC_AND:
18750 case AMDGPUISD::BUFFER_ATOMIC_OR:
18751 case AMDGPUISD::BUFFER_ATOMIC_XOR:
18752 case AMDGPUISD::BUFFER_ATOMIC_INC:
18753 case AMDGPUISD::BUFFER_ATOMIC_DEC:
18754 case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
18755 case AMDGPUISD::BUFFER_ATOMIC_FADD:
18756 case AMDGPUISD::BUFFER_ATOMIC_FMIN:
18757 case AMDGPUISD::BUFFER_ATOMIC_FMAX:
18758 // Target-specific read-modify-write atomics are sources of divergence.
18759 return true;
18760 default:
18761 if (auto *A = dyn_cast<AtomicSDNode>(Val: N)) {
18762 // Generic read-modify-write atomics are sources of divergence.
18763 return A->readMem() && A->writeMem();
18764 }
18765 return false;
18766 }
18767}
18768
18769bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
18770 EVT VT) const {
18771 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
18772 case MVT::f32:
18773 return !denormalModeIsFlushAllF32(MF: DAG.getMachineFunction());
18774 case MVT::f64:
18775 case MVT::f16:
18776 return !denormalModeIsFlushAllF64F16(MF: DAG.getMachineFunction());
18777 default:
18778 return false;
18779 }
18780}
18781
18782bool SITargetLowering::denormalsEnabledForType(
18783 LLT Ty, const MachineFunction &MF) const {
18784 switch (Ty.getScalarSizeInBits()) {
18785 case 32:
18786 return !denormalModeIsFlushAllF32(MF);
18787 case 64:
18788 case 16:
18789 return !denormalModeIsFlushAllF64F16(MF);
18790 default:
18791 return false;
18792 }
18793}
18794
18795bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
18796 const APInt &DemandedElts,
18797 const SelectionDAG &DAG,
18798 bool SNaN,
18799 unsigned Depth) const {
18800 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
18801 const MachineFunction &MF = DAG.getMachineFunction();
18802 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
18803
18804 if (Info->getMode().DX10Clamp)
18805 return true; // Clamped to 0.
18806 return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 0), SNaN, Depth: Depth + 1);
18807 }
18808
18809 return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DemandedElts,
18810 DAG, SNaN, Depth);
18811}
18812
// On older subtargets, global FP atomic instructions have a hardcoded FP mode:
// they do not support FP32 denormals and only support v2f16/f64 denormals.
18815static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW) {
18816 if (RMW->hasMetadata(Kind: "amdgpu.ignore.denormal.mode"))
18817 return true;
18818
18819 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
18820 auto DenormMode = RMW->getFunction()->getDenormalMode(FPType: Flt);
18821 if (DenormMode == DenormalMode::getPreserveSign())
18822 return true;
18823
18824 // TODO: Remove this.
18825 return RMW->getFunction()
18826 ->getFnAttribute(Kind: "amdgpu-unsafe-fp-atomics")
18827 .getValueAsBool();
18828}
18829
18830static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
18831 LLVMContext &Ctx = RMW->getContext();
18832 StringRef MemScope =
18833 Ctx.getSyncScopeName(Id: RMW->getSyncScopeID()).value_or(u: "system");
18834
18835 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
18836 << "Hardware instruction generated for atomic "
18837 << RMW->getOperationName(Op: RMW->getOperation())
18838 << " operation at memory scope " << MemScope;
18839}
18840
18841static bool isV2F16OrV2BF16(Type *Ty) {
18842 if (auto *VT = dyn_cast<FixedVectorType>(Val: Ty)) {
18843 Type *EltTy = VT->getElementType();
18844 return VT->getNumElements() == 2 &&
18845 (EltTy->isHalfTy() || EltTy->isBFloatTy());
18846 }
18847
18848 return false;
18849}
18850
18851static bool isV2F16(Type *Ty) {
18852 FixedVectorType *VT = dyn_cast<FixedVectorType>(Val: Ty);
18853 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
18854}
18855
18856static bool isV2BF16(Type *Ty) {
18857 FixedVectorType *VT = dyn_cast<FixedVectorType>(Val: Ty);
18858 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
18859}
18860
18861/// \return true if atomicrmw integer ops work for the type.
18862static bool isAtomicRMWLegalIntTy(Type *Ty) {
18863 if (auto *IT = dyn_cast<IntegerType>(Val: Ty)) {
18864 unsigned BW = IT->getBitWidth();
18865 return BW == 32 || BW == 64;
18866 }
18867
18868 return false;
18869}
18870
18871/// \return true if this atomicrmw xchg type can be selected.
18872static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
18873 Type *Ty = RMW->getType();
18874 if (isAtomicRMWLegalIntTy(Ty))
18875 return true;
18876
18877 if (PointerType *PT = dyn_cast<PointerType>(Val: Ty)) {
18878 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
18879 unsigned BW = DL.getPointerSizeInBits(AS: PT->getAddressSpace());
18880 return BW == 32 || BW == 64;
18881 }
18882
18883 if (Ty->isFloatTy() || Ty->isDoubleTy())
18884 return true;
18885
18886 if (FixedVectorType *VT = dyn_cast<FixedVectorType>(Val: Ty)) {
18887 return VT->getNumElements() == 2 &&
18888 VT->getElementType()->getPrimitiveSizeInBits() == 16;
18889 }
18890
18891 return false;
18892}
18893
18894/// \returns true if it's valid to emit a native instruction for \p RMW, based
18895/// on the properties of the target memory.
18896static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
18897 const AtomicRMWInst *RMW,
18898 bool HasSystemScope) {
18899 // The remote/fine-grained access logic is different from the integer
18900 // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
18901 // fine-grained access does not work, even for a device local allocation.
18902 //
18903 // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
18904 // allocations work.
18905 if (HasSystemScope) {
18906 if (Subtarget.hasAgentScopeFineGrainedRemoteMemoryAtomics() &&
18907 RMW->hasMetadata(Kind: "amdgpu.no.remote.memory"))
18908 return true;
18909 if (Subtarget.hasEmulatedSystemScopeAtomics())
18910 return true;
18911 } else if (Subtarget.hasAgentScopeFineGrainedRemoteMemoryAtomics())
18912 return true;
18913
18914 return RMW->hasMetadata(Kind: "amdgpu.no.fine.grained.memory");
18915}
18916
18917/// \return Action to perform on AtomicRMWInsts for integer operations.
18918static TargetLowering::AtomicExpansionKind
18919atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
18920 return isAtomicRMWLegalIntTy(Ty: RMW->getType())
18921 ? TargetLowering::AtomicExpansionKind::None
18922 : TargetLowering::AtomicExpansionKind::CmpXChg;
18923}
18924
18925/// Return if a flat address space atomicrmw can access private memory.
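/// For example, !noalias.addrspace metadata whose ranges cover
/// AMDGPUAS::PRIVATE_ADDRESS asserts that the access cannot be to private
/// memory, so such an instruction is known not to access it.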
18926static bool flatInstrMayAccessPrivate(const Instruction *I) {
18927 const MDNode *MD = I->getMetadata(KindID: LLVMContext::MD_noalias_addrspace);
18928 return !MD ||
18929 !AMDGPU::hasValueInRangeLikeMetadata(MD: *MD, Val: AMDGPUAS::PRIVATE_ADDRESS);
18930}
18931
18932static TargetLowering::AtomicExpansionKind
18933getPrivateAtomicExpansionKind(const GCNSubtarget &STI) {
  // With globally addressable scratch (GAS), lower to a flat atomic.
18935 return STI.hasGloballyAddressableScratch()
18936 ? TargetLowering::AtomicExpansionKind::CustomExpand
18937 : TargetLowering::AtomicExpansionKind::NotAtomic;
18938}
18939
18940TargetLowering::AtomicExpansionKind
18941SITargetLowering::shouldExpandAtomicRMWInIR(const AtomicRMWInst *RMW) const {
18942 unsigned AS = RMW->getPointerAddressSpace();
18943 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
18944 return getPrivateAtomicExpansionKind(STI: *getSubtarget());
18945
18946 // 64-bit flat atomics that dynamically reside in private memory will silently
18947 // be dropped.
18948 //
18949 // Note that we will emit a new copy of the original atomic in the expansion,
18950 // which will be incrementally relegalized.
18951 const DataLayout &DL = RMW->getFunction()->getDataLayout();
18952 if (AS == AMDGPUAS::FLAT_ADDRESS &&
18953 DL.getTypeSizeInBits(Ty: RMW->getType()) == 64 &&
18954 flatInstrMayAccessPrivate(I: RMW))
18955 return AtomicExpansionKind::CustomExpand;
18956
18957 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
18958 OptimizationRemarkEmitter ORE(RMW->getFunction());
18959 ORE.emit(RemarkBuilder: [=]() {
18960 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
18961 });
18962 return Kind;
18963 };
18964
18965 auto SSID = RMW->getSyncScopeID();
18966 bool HasSystemScope =
18967 SSID == SyncScope::System ||
18968 SSID == RMW->getContext().getOrInsertSyncScopeID(SSN: "one-as");
18969
18970 auto Op = RMW->getOperation();
18971 switch (Op) {
18972 case AtomicRMWInst::Xchg:
18973 // PCIe supports add and xchg for system atomics.
18974 return isAtomicRMWLegalXChgTy(RMW)
18975 ? TargetLowering::AtomicExpansionKind::None
18976 : TargetLowering::AtomicExpansionKind::CmpXChg;
18977 case AtomicRMWInst::Add:
18978 // PCIe supports add and xchg for system atomics.
18979 return atomicSupportedIfLegalIntType(RMW);
18980 case AtomicRMWInst::Sub:
18981 case AtomicRMWInst::And:
18982 case AtomicRMWInst::Or:
18983 case AtomicRMWInst::Xor:
18984 case AtomicRMWInst::Max:
18985 case AtomicRMWInst::Min:
18986 case AtomicRMWInst::UMax:
18987 case AtomicRMWInst::UMin:
18988 case AtomicRMWInst::UIncWrap:
18989 case AtomicRMWInst::UDecWrap:
18990 case AtomicRMWInst::USubCond:
18991 case AtomicRMWInst::USubSat: {
18992 if (Op == AtomicRMWInst::USubCond && !Subtarget->hasCondSubInsts())
18993 return AtomicExpansionKind::CmpXChg;
18994 if (Op == AtomicRMWInst::USubSat && !Subtarget->hasSubClampInsts())
18995 return AtomicExpansionKind::CmpXChg;
18996 if (Op == AtomicRMWInst::USubCond || Op == AtomicRMWInst::USubSat) {
18997 auto *IT = dyn_cast<IntegerType>(Val: RMW->getType());
18998 if (!IT || IT->getBitWidth() != 32)
18999 return AtomicExpansionKind::CmpXChg;
19000 }
19001
19002 if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
19003 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
19004 if (Subtarget->hasEmulatedSystemScopeAtomics())
19005 return atomicSupportedIfLegalIntType(RMW);
19006
19007 // On most subtargets, for atomicrmw operations other than add/xchg,
19008 // whether or not the instructions will behave correctly depends on where
19009 // the address physically resides and what interconnect is used in the
      // system configuration. On some targets the instruction will nop,
19011 // and in others synchronization will only occur at degraded device scope.
19012 //
19013 // If the allocation is known local to the device, the instructions should
19014 // work correctly.
19015 if (RMW->hasMetadata(Kind: "amdgpu.no.remote.memory"))
19016 return atomicSupportedIfLegalIntType(RMW);
19017
19018 // If fine-grained remote memory works at device scope, we don't need to
19019 // do anything.
19020 if (!HasSystemScope &&
19021 Subtarget->hasAgentScopeFineGrainedRemoteMemoryAtomics())
19022 return atomicSupportedIfLegalIntType(RMW);
19023
19024 // If we are targeting a remote allocated address, it depends what kind of
19025 // allocation the address belongs to.
19026 //
19027 // If the allocation is fine-grained (in host memory, or in PCIe peer
19028 // device memory), the operation will fail depending on the target.
19029 //
19030 // Note fine-grained host memory access does work on APUs or if XGMI is
19031 // used, but we do not know if we are targeting an APU or the system
19032 // configuration from the ISA version/target-cpu.
19033 if (RMW->hasMetadata(Kind: "amdgpu.no.fine.grained.memory"))
19034 return atomicSupportedIfLegalIntType(RMW);
19035
19036 if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
19037 Op == AtomicRMWInst::Xor) {
        // Atomic sub/or/xor do not work over PCI express, but atomic add
        // does. InstCombine transforms these operations with a 0 operand into
        // or, so undo that.
19040 if (const Constant *ConstVal = dyn_cast<Constant>(Val: RMW->getValOperand());
19041 ConstVal && ConstVal->isNullValue())
19042 return AtomicExpansionKind::CustomExpand;
19043 }
19044
19045 // If the allocation could be in remote, fine-grained memory, the rmw
19046 // instructions may fail. cmpxchg should work, so emit that. On some
19047 // system configurations, PCIe atomics aren't supported so cmpxchg won't
19048 // even work, so you're out of luck anyway.
19049
19050 // In summary:
19051 //
19052 // Cases that may fail:
19053 // - fine-grained pinned host memory
19054 // - fine-grained migratable host memory
19055 // - fine-grained PCIe peer device
19056 //
19057 // Cases that should work, but may be treated overly conservatively.
19058 // - fine-grained host memory on an APU
19059 // - fine-grained XGMI peer device
19060 return AtomicExpansionKind::CmpXChg;
19061 }
19062
19063 return atomicSupportedIfLegalIntType(RMW);
19064 }
19065 case AtomicRMWInst::FAdd: {
19066 Type *Ty = RMW->getType();
19067
19068 // TODO: Handle REGION_ADDRESS
19069 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
19070 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
19071 // is fixed to round-to-nearest-even.
19072 //
19073 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
19074 // round-to-nearest-even.
19075 //
19076 // We ignore the rounding mode problem, even in strictfp. The C++ standard
19077 // suggests it is OK if the floating-point mode may not match the calling
19078 // thread.
19079 if (Ty->isFloatTy()) {
19080 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
19081 : AtomicExpansionKind::CmpXChg;
19082 }
19083
19084 if (Ty->isDoubleTy()) {
19085 // Ignores denormal mode, but we don't consider flushing mandatory.
19086 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
19087 : AtomicExpansionKind::CmpXChg;
19088 }
19089
19090 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
19091 return AtomicExpansionKind::None;
19092
19093 return AtomicExpansionKind::CmpXChg;
19094 }
19095
19096 // LDS atomics respect the denormal mode from the mode register.
19097 //
19098 // Traditionally f32 global/buffer memory atomics would unconditionally
19099 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
19100 // flush.
19101 //
19102 // On targets with flat atomic fadd, denormals would flush depending on
19103 // whether the target address resides in LDS or global memory. We consider
19104 // this flat-maybe-flush as will-flush.
19105 if (Ty->isFloatTy() &&
19106 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
19107 !atomicIgnoresDenormalModeOrFPModeIsFTZ(RMW))
19108 return AtomicExpansionKind::CmpXChg;
19109
19110 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
19111 // safe. The message phrasing also should be better.
19112 if (globalMemoryFPAtomicIsLegal(Subtarget: *Subtarget, RMW, HasSystemScope)) {
19113 if (AS == AMDGPUAS::FLAT_ADDRESS) {
19114 // gfx942, gfx12
19115 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
19116 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19117 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
19118 // gfx90a, gfx942, gfx12
19119 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
19120 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19121
19122 // gfx942, gfx12
19123 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
19124 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19125 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
19126 // gfx90a, gfx942, gfx12
19127 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
19128 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19129
19130 // While gfx90a/gfx942 supports v2bf16 for global/flat, it does not for
19131 // buffer. gfx12 does have the buffer version.
19132 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
19133 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19134 }
19135
19136 // global and flat atomic fadd f64: gfx90a, gfx942.
19137 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
19138 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19139
19140 if (AS != AMDGPUAS::FLAT_ADDRESS) {
19141 if (Ty->isFloatTy()) {
19142 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx942,
19143 // gfx11+.
19144 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
19145 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19146 // global/buffer atomic fadd f32 rtn: gfx90a, gfx942, gfx11+.
19147 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
19148 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19149 } else {
19150 // gfx908
19151 if (RMW->use_empty() &&
19152 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
19153 isV2F16(Ty))
19154 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19155 }
19156 }
19157
19158 // flat atomic fadd f32: gfx942, gfx11+.
19159 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
19160 if (Subtarget->hasFlatAtomicFaddF32Inst())
19161 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19162
      // If it is in the flat address space and the type is float, we will try
      // to expand it if the target supports global and LDS atomic fadd. The
      // expansion emits a runtime check of the address space: if the address
      // is in the global address space, we emit the global atomic fadd; if it
      // is in the shared address space, we emit the LDS atomic fadd.
19169 if (Subtarget->hasLDSFPAtomicAddF32()) {
19170 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
19171 return AtomicExpansionKind::CustomExpand;
19172 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
19173 return AtomicExpansionKind::CustomExpand;
19174 }
19175 }
19176 }
19177
19178 return AtomicExpansionKind::CmpXChg;
19179 }
19180 case AtomicRMWInst::FMin:
19181 case AtomicRMWInst::FMax: {
19182 Type *Ty = RMW->getType();
19183
19184 // LDS float and double fmin/fmax were always supported.
19185 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
19186 return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
19187 : AtomicExpansionKind::CmpXChg;
19188 }
19189
19190 if (globalMemoryFPAtomicIsLegal(Subtarget: *Subtarget, RMW, HasSystemScope)) {
19191 // For flat and global cases:
19192 // float, double in gfx7. Manual claims denormal support.
19193 // Removed in gfx8.
19194 // float, double restored in gfx10.
19195 // double removed again in gfx11, so only f32 for gfx11/gfx12.
19196 //
19197 // For gfx9, gfx90a and gfx942 support f64 for global (same as fadd), but
19198 // no f32.
19199 if (AS == AMDGPUAS::FLAT_ADDRESS) {
19200 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
19201 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19202 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
19203 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19204 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
19205 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
19206 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
19207 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19208 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
19209 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19210 }
19211 }
19212
19213 return AtomicExpansionKind::CmpXChg;
19214 }
19215 case AtomicRMWInst::Nand:
19216 case AtomicRMWInst::FSub:
19217 default:
19218 return AtomicExpansionKind::CmpXChg;
19219 }
19220
19221 llvm_unreachable("covered atomicrmw op switch");
19222}
19223
19224TargetLowering::AtomicExpansionKind
19225SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
19226 return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
19227 ? getPrivateAtomicExpansionKind(STI: *getSubtarget())
19228 : AtomicExpansionKind::None;
19229}
19230
19231TargetLowering::AtomicExpansionKind
19232SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
19233 return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
19234 ? getPrivateAtomicExpansionKind(STI: *getSubtarget())
19235 : AtomicExpansionKind::None;
19236}
19237
19238TargetLowering::AtomicExpansionKind
19239SITargetLowering::shouldExpandAtomicCmpXchgInIR(
19240 const AtomicCmpXchgInst *CmpX) const {
19241 unsigned AddrSpace = CmpX->getPointerAddressSpace();
19242 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
19243 return getPrivateAtomicExpansionKind(STI: *getSubtarget());
19244
19245 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(I: CmpX))
19246 return AtomicExpansionKind::None;
19247
19248 const DataLayout &DL = CmpX->getDataLayout();
19249
19250 Type *ValTy = CmpX->getNewValOperand()->getType();
19251
19252 // If a 64-bit flat atomic may alias private, we need to avoid using the
19253 // atomic in the private case.
19254 return DL.getTypeSizeInBits(Ty: ValTy) == 64 ? AtomicExpansionKind::CustomExpand
19255 : AtomicExpansionKind::None;
19256}
19257
19258const TargetRegisterClass *
19259SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
19260 const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, isDivergent: false);
19261 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
19262 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
19263 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
19264 : &AMDGPU::SReg_32RegClass;
19265 if (!TRI->isSGPRClass(RC) && !isDivergent)
19266 return TRI->getEquivalentSGPRClass(VRC: RC);
19267 if (TRI->isSGPRClass(RC) && isDivergent) {
19268 if (Subtarget->hasGFX90AInsts())
19269 return TRI->getEquivalentAVClass(SRC: RC);
19270 return TRI->getEquivalentVGPRClass(SRC: RC);
19271 }
19272
19273 return RC;
19274}
19275
19276// FIXME: This is a workaround for DivergenceAnalysis not understanding always
19277// uniform values (as produced by the mask results of control flow intrinsics)
19278// used outside of divergent blocks. The phi users need to also be treated as
19279// always uniform.
19280//
19281 // FIXME: DA is no longer in use. Does this still apply to UniformityAnalysis?
19282static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
19283 unsigned WaveSize) {
19284 // FIXME: We assume we never cast the mask results of a control flow
19285 // intrinsic.
19286  // As a compile-time hack, exit early if the type is not a wave-sized integer.
19287 IntegerType *IT = dyn_cast<IntegerType>(Val: V->getType());
19288 if (!IT || IT->getBitWidth() != WaveSize)
19289 return false;
19290
19291 if (!isa<Instruction>(Val: V))
19292 return false;
19293 if (!Visited.insert(Ptr: V).second)
19294 return false;
19295 bool Result = false;
19296 for (const auto *U : V->users()) {
19297 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: U)) {
19298 if (V == U->getOperand(i: 1)) {
19299 switch (Intrinsic->getIntrinsicID()) {
19300 default:
19301 Result = false;
19302 break;
19303 case Intrinsic::amdgcn_if_break:
19304 case Intrinsic::amdgcn_if:
19305 case Intrinsic::amdgcn_else:
19306 Result = true;
19307 break;
19308 }
19309 }
19310 if (V == U->getOperand(i: 0)) {
19311 switch (Intrinsic->getIntrinsicID()) {
19312 default:
19313 Result = false;
19314 break;
19315 case Intrinsic::amdgcn_end_cf:
19316 case Intrinsic::amdgcn_loop:
19317 Result = true;
19318 break;
19319 }
19320 }
19321 } else {
19322 Result = hasCFUser(V: U, Visited, WaveSize);
19323 }
19324 if (Result)
19325 break;
19326 }
19327 return Result;
19328}
19329
19330bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
19331 const Value *V) const {
19332 if (const CallInst *CI = dyn_cast<CallInst>(Val: V)) {
19333 if (CI->isInlineAsm()) {
19334 // FIXME: This cannot give a correct answer. This should only trigger in
19335 // the case where inline asm returns mixed SGPR and VGPR results, used
19336 // outside the defining block. We don't have a specific result to
19337      // consider, so this assumes that if any result is an SGPR, the overall
19338      // register also needs to be an SGPR.
19339 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
19340 TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
19341 DL: MF.getDataLayout(), TRI: Subtarget->getRegisterInfo(), Call: *CI);
19342 for (auto &TC : TargetConstraints) {
19343 if (TC.Type == InlineAsm::isOutput) {
19344 ComputeConstraintToUse(OpInfo&: TC, Op: SDValue());
19345 const TargetRegisterClass *RC =
19346 getRegForInlineAsmConstraint(TRI_: SIRI, Constraint: TC.ConstraintCode,
19347 VT: TC.ConstraintVT)
19348 .second;
19349 if (RC && SIRI->isSGPRClass(RC))
19350 return true;
19351 }
19352 }
19353 }
19354 }
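  // Otherwise, values feeding the lane-mask operands of control-flow intrinsics
  // must also be treated as uniform; walk the users to detect that.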
19355 SmallPtrSet<const Value *, 16> Visited;
19356 return hasCFUser(V, Visited, WaveSize: Subtarget->getWavefrontSize());
19357}
19358
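// Return true if any user of N is a memory node that uses N as its base
// pointer operand.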
19359bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
19360 for (SDUse &Use : N->uses()) {
19361 if (MemSDNode *M = dyn_cast<MemSDNode>(Val: Use.getUser())) {
19362 if (getBasePtrIndex(N: M) == Use.getOperandNo())
19363 return true;
19364 }
19365 }
19366 return false;
19367}
19368
19369bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
19370 SDValue N1) const {
19371 if (!N0.hasOneUse())
19372 return false;
19373  // Prefer reassociations that keep N0 uniform.
19374 if (N0->isDivergent() || !N1->isDivergent())
19375 return true;
19376  // Otherwise, only reassociate if it is likely to expose a base plus constant
19377  // offset memory access pattern.
19378 return (DAG.isBaseWithConstantOffset(Op: N0) &&
19379 hasMemSDNodeUser(N: *N0->user_begin()));
19380}
19381
19382bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
19383 Register N0, Register N1) const {
19384 return MRI.hasOneNonDBGUse(RegNo: N0); // FIXME: handle regbanks
19385}
19386
19387MachineMemOperand::Flags
19388SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
19389 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
19390 MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
19391 if (I.getMetadata(Kind: "amdgpu.noclobber"))
19392 Flags |= MONoClobber;
19393 if (I.getMetadata(Kind: "amdgpu.last.use"))
19394 Flags |= MOLastUse;
19395 return Flags;
19396}
19397
19398void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
19399 Instruction *AI) const {
19400 // Given: atomicrmw fadd ptr %addr, float %val ordering
19401 //
19402 // With this expansion we produce the following code:
19403 // [...]
19404 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
19405 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
19406 //
19407 // atomicrmw.shared:
19408 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
19409 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
19410 // float %val ordering
19411 // br label %atomicrmw.phi
19412 //
19413 // atomicrmw.check.private:
19414  //   %is.private = call i1 @llvm.amdgcn.is.private(ptr %addr)
19415 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
19416 //
19417 // atomicrmw.private:
19418 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
19419 // %loaded.private = load float, ptr addrspace(5) %cast.private
19420 // %val.new = fadd float %loaded.private, %val
19421 // store float %val.new, ptr addrspace(5) %cast.private
19422 // br label %atomicrmw.phi
19423 //
19424 // atomicrmw.global:
19425 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
19426 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
19427 // float %val ordering
19428 // br label %atomicrmw.phi
19429 //
19430 // atomicrmw.phi:
19431 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
19432 // [ %loaded.private, %atomicrmw.private ],
19433 // [ %loaded.global, %atomicrmw.global ]
19434 // br label %atomicrmw.end
19435 //
19436 // atomicrmw.end:
19437 // [...]
19438 //
19439 //
19440  // For 64-bit atomics that may reside in private memory, we perform a simpler
19441  // expansion that only inserts the private check and keeps the flat operation.
19442
19443 IRBuilder<> Builder(AI);
19444 LLVMContext &Ctx = Builder.getContext();
19445
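  // AI is either an AtomicRMWInst or an AtomicCmpXchgInst; the pointer operand
  // index differs between the two.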
19446 auto *RMW = dyn_cast<AtomicRMWInst>(Val: AI);
19447 const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
19448 : AtomicCmpXchgInst::getPointerOperandIndex();
19449 Value *Addr = AI->getOperand(i: PtrOpIdx);
19450
19451  // TODO: Only the private check is needed; we could then emit a flat operation
19452  // known not to access private (no shared block or cast to global required).
19453 AtomicCmpXchgInst *CX = dyn_cast<AtomicCmpXchgInst>(Val: AI);
19454
19455 Align Alignment;
19456 if (RMW)
19457 Alignment = RMW->getAlign();
19458 else if (CX)
19459 Alignment = CX->getAlign();
19460 else
19461 llvm_unreachable("unhandled atomic operation");
19462
19463 // FullFlatEmulation is true if we need to issue the private, shared, and
19464 // global cases.
19465 //
19466 // If this is false, we are only dealing with the flat-targeting-private case,
19467 // where we only insert a check for private and still use the flat instruction
19468 // for global and shared.
19469
19470 bool FullFlatEmulation =
19471 RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
19472 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
19473 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
19474 RMW->getType()->isDoubleTy()));
19475
19476 // If the return value isn't used, do not introduce a false use in the phi.
19477 bool ReturnValueIsUsed = !AI->use_empty();
19478
19479 BasicBlock *BB = Builder.GetInsertBlock();
19480 Function *F = BB->getParent();
19481 BasicBlock *ExitBB =
19482 BB->splitBasicBlock(I: Builder.GetInsertPoint(), BBName: "atomicrmw.end");
19483 BasicBlock *SharedBB = nullptr;
19484
19485 BasicBlock *CheckPrivateBB = BB;
19486 if (FullFlatEmulation) {
19487 SharedBB = BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.shared", Parent: F, InsertBefore: ExitBB);
19488 CheckPrivateBB =
19489 BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.check.private", Parent: F, InsertBefore: ExitBB);
19490 }
19491
19492 BasicBlock *PrivateBB =
19493 BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.private", Parent: F, InsertBefore: ExitBB);
19494 BasicBlock *GlobalBB = BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.global", Parent: F, InsertBefore: ExitBB);
19495 BasicBlock *PhiBB = BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.phi", Parent: F, InsertBefore: ExitBB);
19496
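  // splitBasicBlock added an unconditional branch to ExitBB at the end of BB;
  // remove it so we can emit our own control flow below.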
19497 std::prev(x: BB->end())->eraseFromParent();
19498 Builder.SetInsertPoint(BB);
19499
19500 Value *LoadedShared = nullptr;
19501 if (FullFlatEmulation) {
19502 CallInst *IsShared = Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_is_shared,
19503 Args: {Addr}, FMFSource: nullptr, Name: "is.shared");
19504 Builder.CreateCondBr(Cond: IsShared, True: SharedBB, False: CheckPrivateBB);
19505 Builder.SetInsertPoint(SharedBB);
19506 Value *CastToLocal = Builder.CreateAddrSpaceCast(
19507 V: Addr, DestTy: PointerType::get(C&: Ctx, AddressSpace: AMDGPUAS::LOCAL_ADDRESS));
19508
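    // Clone the original atomic operation into the shared block, rewriting its
    // pointer operand to the LDS (local) address space cast.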
19509 Instruction *Clone = AI->clone();
19510 Clone->insertInto(ParentBB: SharedBB, It: SharedBB->end());
19511 Clone->getOperandUse(i: PtrOpIdx).set(CastToLocal);
19512 LoadedShared = Clone;
19513
19514 Builder.CreateBr(Dest: PhiBB);
19515 Builder.SetInsertPoint(CheckPrivateBB);
19516 }
19517
19518 CallInst *IsPrivate = Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_is_private,
19519 Args: {Addr}, FMFSource: nullptr, Name: "is.private");
19520 Builder.CreateCondBr(Cond: IsPrivate, True: PrivateBB, False: GlobalBB);
19521
19522 Builder.SetInsertPoint(PrivateBB);
19523
19524 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
19525 V: Addr, DestTy: PointerType::get(C&: Ctx, AddressSpace: AMDGPUAS::PRIVATE_ADDRESS));
19526
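  // Private (scratch) memory is only visible to the issuing work item, so a
  // plain load/modify/store (or load/compare/store) sequence is sufficient.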
19527 Value *LoadedPrivate;
19528 if (RMW) {
19529 LoadedPrivate = Builder.CreateAlignedLoad(
19530 Ty: RMW->getType(), Ptr: CastToPrivate, Align: RMW->getAlign(), Name: "loaded.private");
19531
19532 Value *NewVal = buildAtomicRMWValue(Op: RMW->getOperation(), Builder,
19533 Loaded: LoadedPrivate, Val: RMW->getValOperand());
19534
19535 Builder.CreateAlignedStore(Val: NewVal, Ptr: CastToPrivate, Align: RMW->getAlign());
19536 } else {
19537 auto [ResultLoad, Equal] =
19538 buildCmpXchgValue(Builder, Ptr: CastToPrivate, Cmp: CX->getCompareOperand(),
19539 Val: CX->getNewValOperand(), Alignment: CX->getAlign());
19540
19541 Value *Insert = Builder.CreateInsertValue(Agg: PoisonValue::get(T: CX->getType()),
19542 Val: ResultLoad, Idxs: 0);
19543 LoadedPrivate = Builder.CreateInsertValue(Agg: Insert, Val: Equal, Idxs: 1);
19544 }
19545
19546 Builder.CreateBr(Dest: PhiBB);
19547
19548 Builder.SetInsertPoint(GlobalBB);
19549
19550 // Continue using a flat instruction if we only emitted the check for private.
19551 Instruction *LoadedGlobal = AI;
19552 if (FullFlatEmulation) {
19553 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
19554 V: Addr, DestTy: PointerType::get(C&: Ctx, AddressSpace: AMDGPUAS::GLOBAL_ADDRESS));
19555 AI->getOperandUse(i: PtrOpIdx).set(CastToGlobal);
19556 }
19557
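  // Move the original atomic (with its pointer operand possibly rewritten to
  // the global address space above) into the global block.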
19558 AI->removeFromParent();
19559 AI->insertInto(ParentBB: GlobalBB, It: GlobalBB->end());
19560
19561 // The new atomicrmw may go through another round of legalization later.
19562 if (!FullFlatEmulation) {
19563 // We inserted the runtime check already, make sure we do not try to
19564 // re-expand this.
19565 // TODO: Should union with any existing metadata.
19566 MDBuilder MDB(F->getContext());
19567 MDNode *RangeNotPrivate =
19568 MDB.createRange(Lo: APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
19569 Hi: APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
19570 LoadedGlobal->setMetadata(KindID: LLVMContext::MD_noalias_addrspace,
19571 Node: RangeNotPrivate);
19572 }
19573
19574 Builder.CreateBr(Dest: PhiBB);
19575
19576 Builder.SetInsertPoint(PhiBB);
19577
19578 if (ReturnValueIsUsed) {
19579 PHINode *Loaded = Builder.CreatePHI(Ty: AI->getType(), NumReservedValues: 3);
19580 AI->replaceAllUsesWith(V: Loaded);
19581 if (FullFlatEmulation)
19582 Loaded->addIncoming(V: LoadedShared, BB: SharedBB);
19583 Loaded->addIncoming(V: LoadedPrivate, BB: PrivateBB);
19584 Loaded->addIncoming(V: LoadedGlobal, BB: GlobalBB);
19585 Loaded->takeName(V: AI);
19586 }
19587
19588 Builder.CreateBr(Dest: ExitBB);
19589}
19590
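// Rewrite the pointer operand of the atomic instruction I from the scratch
// (private) address space to the flat address space via an addrspacecast, so
// that a flat atomic can be used for it.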
19591static void convertScratchAtomicToFlatAtomic(Instruction *I,
19592 unsigned PtrOpIdx) {
19593 Value *PtrOp = I->getOperand(i: PtrOpIdx);
19594 assert(PtrOp->getType()->getPointerAddressSpace() ==
19595 AMDGPUAS::PRIVATE_ADDRESS);
19596
19597 Type *FlatPtr = PointerType::get(C&: I->getContext(), AddressSpace: AMDGPUAS::FLAT_ADDRESS);
19598 Value *ASCast = CastInst::CreatePointerCast(S: PtrOp, Ty: FlatPtr, Name: "scratch.ascast",
19599 InsertBefore: I->getIterator());
19600 I->setOperand(i: PtrOpIdx, Val: ASCast);
19601}
19602
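// IR expansion hook for atomicrmw: scratch atomics are rewritten to use flat
// pointers, identity sub/or/xor with a zero operand are rewritten to add 0,
// and flat atomics that may access private memory get the address-space
// predicate expansion.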
19603void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
19604 AtomicRMWInst::BinOp Op = AI->getOperation();
19605
19606 if (AI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
19607 return convertScratchAtomicToFlatAtomic(I: AI, PtrOpIdx: AI->getPointerOperandIndex());
19608
19609 if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
19610 Op == AtomicRMWInst::Xor) {
19611 if (const auto *ConstVal = dyn_cast<Constant>(Val: AI->getValOperand());
19612 ConstVal && ConstVal->isNullValue()) {
19613 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
19614 AI->setOperation(AtomicRMWInst::Add);
19615
19616 // We may still need the private-alias-flat handling below.
19617
19618 // TODO: Skip this for cases where we cannot access remote memory.
19619 }
19620 }
19621
19622 // The non-flat expansions should only perform the de-canonicalization of
19623 // identity values.
19624 if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
19625 return;
19626
19627 emitExpandAtomicAddrSpacePredicate(AI);
19628}
19629
19630void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
19631 if (CI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
19632 return convertScratchAtomicToFlatAtomic(I: CI, PtrOpIdx: CI->getPointerOperandIndex());
19633
19634 emitExpandAtomicAddrSpacePredicate(AI: CI);
19635}
19636
19637void SITargetLowering::emitExpandAtomicLoad(LoadInst *LI) const {
19638 if (LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
19639 return convertScratchAtomicToFlatAtomic(I: LI, PtrOpIdx: LI->getPointerOperandIndex());
19640
19641 llvm_unreachable(
19642 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
19643}
19644
19645void SITargetLowering::emitExpandAtomicStore(StoreInst *SI) const {
19646 if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
19647 return convertScratchAtomicToFlatAtomic(I: SI, PtrOpIdx: SI->getPointerOperandIndex());
19648
19649 llvm_unreachable(
19650 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
19651}
19652
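// Replace an idempotent atomicrmw (as determined by the caller) with an atomic
// load of the same ordering, for example (illustrative):
//   atomicrmw or ptr %p, i32 0 monotonic
// becomes
//   load atomic i32, ptr %p monotonic, align 4
// where the alignment, syncscope, metadata, and name are taken from the
// original instruction.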
19653LoadInst *
19654SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
19655 IRBuilder<> Builder(AI);
19656 auto Order = AI->getOrdering();
19657
19658  // This optimization removes the store aspect of the atomicrmw, so the cache
19659  // must still be flushed if the original ordering had release semantics. That
19660  // flush is not necessarily a fence, but a release fence happens to provide it.
19661  // Therefore, avoid replacing an atomicrmw that has release semantics.
19662 if (isReleaseOrStronger(AO: Order))
19663 return nullptr;
19664
19665 LoadInst *LI = Builder.CreateAlignedLoad(
19666 Ty: AI->getType(), Ptr: AI->getPointerOperand(), Align: AI->getAlign());
19667 LI->setAtomic(Ordering: Order, SSID: AI->getSyncScopeID());
19668 LI->copyMetadata(SrcInst: *AI);
19669 LI->takeName(V: AI);
19670 AI->replaceAllUsesWith(V: LI);
19671 AI->eraseFromParent();
19672 return LI;
19673}
19674