1 | //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | /// \file |
10 | /// Custom DAG lowering for SI |
11 | // |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #include "SIISelLowering.h" |
15 | #include "AMDGPU.h" |
16 | #include "AMDGPUInstrInfo.h" |
17 | #include "AMDGPUTargetMachine.h" |
18 | #include "GCNSubtarget.h" |
19 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
20 | #include "SIMachineFunctionInfo.h" |
21 | #include "SIRegisterInfo.h" |
22 | #include "llvm/ADT/APInt.h" |
23 | #include "llvm/ADT/FloatingPointMode.h" |
24 | #include "llvm/ADT/Statistic.h" |
25 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
26 | #include "llvm/Analysis/UniformityAnalysis.h" |
27 | #include "llvm/BinaryFormat/ELF.h" |
28 | #include "llvm/CodeGen/Analysis.h" |
29 | #include "llvm/CodeGen/ByteProvider.h" |
30 | #include "llvm/CodeGen/FunctionLoweringInfo.h" |
31 | #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" |
32 | #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" |
33 | #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" |
34 | #include "llvm/CodeGen/MachineFrameInfo.h" |
35 | #include "llvm/CodeGen/MachineFunction.h" |
36 | #include "llvm/CodeGen/MachineLoopInfo.h" |
37 | #include "llvm/IR/DiagnosticInfo.h" |
38 | #include "llvm/IR/IRBuilder.h" |
39 | #include "llvm/IR/IntrinsicInst.h" |
40 | #include "llvm/IR/IntrinsicsAMDGPU.h" |
41 | #include "llvm/IR/IntrinsicsR600.h" |
42 | #include "llvm/Support/CommandLine.h" |
43 | #include "llvm/Support/KnownBits.h" |
44 | #include "llvm/Support/ModRef.h" |
45 | #include <optional> |
46 | |
47 | using namespace llvm; |
48 | |
49 | #define DEBUG_TYPE "si-lower" |
50 | |
STATISTIC(NumTailCalls, "Number of tail calls");
52 | |
static cl::opt<bool> DisableLoopAlignment(
    "amdgpu-disable-loop-alignment",
    cl::desc("Do not align and prefetch loops"),
    cl::init(false));

static cl::opt<bool> UseDivergentRegisterIndexing(
    "amdgpu-use-divergent-register-indexing",
    cl::Hidden,
    cl::desc("Use indirect register addressing for divergent indexes"),
    cl::init(false));
63 | |
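// Return true if f32 denormals are flushed to preserve-sign zero in this
// function (i.e. f32 denormal values are not preserved).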
64 | static bool denormalModeIsFlushAllF32(const MachineFunction &MF) { |
65 | const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
66 | return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign(); |
67 | } |
68 | |
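// Likewise for the shared f64/f16 denormal mode.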
69 | static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF) { |
70 | const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
71 | return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign(); |
72 | } |
73 | |
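// Find the first 32-bit SGPR that CCInfo has not already allocated; reports
// llvm_unreachable if every SGPR is taken.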
74 | static unsigned findFirstFreeSGPR(CCState &CCInfo) { |
75 | unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); |
76 | for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) { |
77 | if (!CCInfo.isAllocated(Reg: AMDGPU::SGPR0 + Reg)) { |
78 | return AMDGPU::SGPR0 + Reg; |
79 | } |
80 | } |
llvm_unreachable("Cannot allocate sgpr");
82 | } |
83 | |
84 | SITargetLowering::SITargetLowering(const TargetMachine &TM, |
85 | const GCNSubtarget &STI) |
86 | : AMDGPUTargetLowering(TM, STI), |
87 | Subtarget(&STI) { |
88 | addRegisterClass(VT: MVT::i1, RC: &AMDGPU::VReg_1RegClass); |
89 | addRegisterClass(VT: MVT::i64, RC: &AMDGPU::SReg_64RegClass); |
90 | |
91 | addRegisterClass(VT: MVT::i32, RC: &AMDGPU::SReg_32RegClass); |
92 | addRegisterClass(VT: MVT::f32, RC: &AMDGPU::VGPR_32RegClass); |
93 | |
94 | addRegisterClass(VT: MVT::v2i32, RC: &AMDGPU::SReg_64RegClass); |
95 | |
96 | const SIRegisterInfo *TRI = STI.getRegisterInfo(); |
97 | const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class(); |
98 | |
99 | addRegisterClass(VT: MVT::f64, RC: V64RegClass); |
100 | addRegisterClass(VT: MVT::v2f32, RC: V64RegClass); |
101 | addRegisterClass(VT: MVT::Untyped, RC: V64RegClass); |
102 | |
103 | addRegisterClass(VT: MVT::v3i32, RC: &AMDGPU::SGPR_96RegClass); |
104 | addRegisterClass(VT: MVT::v3f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 96)); |
105 | |
106 | addRegisterClass(VT: MVT::v2i64, RC: &AMDGPU::SGPR_128RegClass); |
107 | addRegisterClass(VT: MVT::v2f64, RC: &AMDGPU::SGPR_128RegClass); |
108 | |
109 | addRegisterClass(VT: MVT::v4i32, RC: &AMDGPU::SGPR_128RegClass); |
110 | addRegisterClass(VT: MVT::v4f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 128)); |
111 | |
112 | addRegisterClass(VT: MVT::v5i32, RC: &AMDGPU::SGPR_160RegClass); |
113 | addRegisterClass(VT: MVT::v5f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 160)); |
114 | |
115 | addRegisterClass(VT: MVT::v6i32, RC: &AMDGPU::SGPR_192RegClass); |
116 | addRegisterClass(VT: MVT::v6f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 192)); |
117 | |
118 | addRegisterClass(VT: MVT::v3i64, RC: &AMDGPU::SGPR_192RegClass); |
119 | addRegisterClass(VT: MVT::v3f64, RC: TRI->getVGPRClassForBitWidth(BitWidth: 192)); |
120 | |
121 | addRegisterClass(VT: MVT::v7i32, RC: &AMDGPU::SGPR_224RegClass); |
122 | addRegisterClass(VT: MVT::v7f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 224)); |
123 | |
124 | addRegisterClass(VT: MVT::v8i32, RC: &AMDGPU::SGPR_256RegClass); |
125 | addRegisterClass(VT: MVT::v8f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 256)); |
126 | |
127 | addRegisterClass(VT: MVT::v4i64, RC: &AMDGPU::SGPR_256RegClass); |
128 | addRegisterClass(VT: MVT::v4f64, RC: TRI->getVGPRClassForBitWidth(BitWidth: 256)); |
129 | |
130 | addRegisterClass(VT: MVT::v9i32, RC: &AMDGPU::SGPR_288RegClass); |
131 | addRegisterClass(VT: MVT::v9f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 288)); |
132 | |
133 | addRegisterClass(VT: MVT::v10i32, RC: &AMDGPU::SGPR_320RegClass); |
134 | addRegisterClass(VT: MVT::v10f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 320)); |
135 | |
136 | addRegisterClass(VT: MVT::v11i32, RC: &AMDGPU::SGPR_352RegClass); |
137 | addRegisterClass(VT: MVT::v11f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 352)); |
138 | |
139 | addRegisterClass(VT: MVT::v12i32, RC: &AMDGPU::SGPR_384RegClass); |
140 | addRegisterClass(VT: MVT::v12f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 384)); |
141 | |
142 | addRegisterClass(VT: MVT::v16i32, RC: &AMDGPU::SGPR_512RegClass); |
143 | addRegisterClass(VT: MVT::v16f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 512)); |
144 | |
145 | addRegisterClass(VT: MVT::v8i64, RC: &AMDGPU::SGPR_512RegClass); |
146 | addRegisterClass(VT: MVT::v8f64, RC: TRI->getVGPRClassForBitWidth(BitWidth: 512)); |
147 | |
148 | addRegisterClass(VT: MVT::v16i64, RC: &AMDGPU::SGPR_1024RegClass); |
149 | addRegisterClass(VT: MVT::v16f64, RC: TRI->getVGPRClassForBitWidth(BitWidth: 1024)); |
150 | |
151 | if (Subtarget->has16BitInsts()) { |
152 | if (Subtarget->useRealTrue16Insts()) { |
153 | addRegisterClass(VT: MVT::i16, RC: &AMDGPU::VGPR_16RegClass); |
154 | addRegisterClass(VT: MVT::f16, RC: &AMDGPU::VGPR_16RegClass); |
155 | addRegisterClass(VT: MVT::bf16, RC: &AMDGPU::VGPR_16RegClass); |
156 | } else { |
157 | addRegisterClass(VT: MVT::i16, RC: &AMDGPU::SReg_32RegClass); |
158 | addRegisterClass(VT: MVT::f16, RC: &AMDGPU::SReg_32RegClass); |
159 | addRegisterClass(VT: MVT::bf16, RC: &AMDGPU::SReg_32RegClass); |
160 | } |
161 | |
// Unless there are also VOP3P operations, no operations are really legal.
163 | addRegisterClass(VT: MVT::v2i16, RC: &AMDGPU::SReg_32RegClass); |
164 | addRegisterClass(VT: MVT::v2f16, RC: &AMDGPU::SReg_32RegClass); |
165 | addRegisterClass(VT: MVT::v2bf16, RC: &AMDGPU::SReg_32RegClass); |
166 | addRegisterClass(VT: MVT::v4i16, RC: &AMDGPU::SReg_64RegClass); |
167 | addRegisterClass(VT: MVT::v4f16, RC: &AMDGPU::SReg_64RegClass); |
168 | addRegisterClass(VT: MVT::v4bf16, RC: &AMDGPU::SReg_64RegClass); |
169 | addRegisterClass(VT: MVT::v8i16, RC: &AMDGPU::SGPR_128RegClass); |
170 | addRegisterClass(VT: MVT::v8f16, RC: &AMDGPU::SGPR_128RegClass); |
171 | addRegisterClass(VT: MVT::v8bf16, RC: &AMDGPU::SGPR_128RegClass); |
172 | addRegisterClass(VT: MVT::v16i16, RC: &AMDGPU::SGPR_256RegClass); |
173 | addRegisterClass(VT: MVT::v16f16, RC: &AMDGPU::SGPR_256RegClass); |
174 | addRegisterClass(VT: MVT::v16bf16, RC: &AMDGPU::SGPR_256RegClass); |
175 | addRegisterClass(VT: MVT::v32i16, RC: &AMDGPU::SGPR_512RegClass); |
176 | addRegisterClass(VT: MVT::v32f16, RC: &AMDGPU::SGPR_512RegClass); |
177 | addRegisterClass(VT: MVT::v32bf16, RC: &AMDGPU::SGPR_512RegClass); |
178 | } |
179 | |
180 | addRegisterClass(VT: MVT::v32i32, RC: &AMDGPU::VReg_1024RegClass); |
181 | addRegisterClass(VT: MVT::v32f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 1024)); |
182 | |
183 | computeRegisterProperties(TRI: Subtarget->getRegisterInfo()); |
184 | |
185 | // The boolean content concept here is too inflexible. Compares only ever |
186 | // really produce a 1-bit result. Any copy/extend from these will turn into a |
187 | // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as |
188 | // it's what most targets use. |
189 | setBooleanContents(ZeroOrOneBooleanContent); |
190 | setBooleanVectorContents(ZeroOrOneBooleanContent); |
191 | |
192 | // We need to custom lower vector stores from local memory |
193 | setOperationAction(Ops: ISD::LOAD, |
194 | VTs: {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, |
195 | MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32, |
196 | MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32, |
197 | MVT::i1, MVT::v32i32}, |
198 | Action: Custom); |
199 | |
200 | setOperationAction(Ops: ISD::STORE, |
201 | VTs: {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, |
202 | MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32, |
203 | MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32, |
204 | MVT::i1, MVT::v32i32}, |
205 | Action: Custom); |
206 | |
207 | if (isTypeLegal(VT: MVT::bf16)) { |
208 | for (unsigned Opc : |
209 | {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV, |
210 | ISD::FREM, ISD::FMA, ISD::FMINNUM, ISD::FMAXNUM, |
211 | ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT, |
212 | ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI, |
213 | ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2, |
214 | ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10, |
215 | ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT, |
216 | ISD::FROUND, ISD::FROUNDEVEN, ISD::FFLOOR, ISD::FCANONICALIZE, |
217 | ISD::SETCC}) { |
218 | // FIXME: The promoted to type shouldn't need to be explicit |
219 | setOperationAction(Op: Opc, VT: MVT::bf16, Action: Promote); |
220 | AddPromotedToType(Opc, OrigVT: MVT::bf16, DestVT: MVT::f32); |
221 | } |
222 | |
223 | setOperationAction(Op: ISD::FP_ROUND, VT: MVT::bf16, Action: Expand); |
224 | |
225 | setOperationAction(Op: ISD::SELECT, VT: MVT::bf16, Action: Promote); |
226 | AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::bf16, DestVT: MVT::i16); |
227 | |
228 | setOperationAction(Op: ISD::FABS, VT: MVT::bf16, Action: Legal); |
229 | setOperationAction(Op: ISD::FNEG, VT: MVT::bf16, Action: Legal); |
230 | setOperationAction(Op: ISD::FCOPYSIGN, VT: MVT::bf16, Action: Legal); |
231 | |
232 | // We only need to custom lower because we can't specify an action for bf16 |
233 | // sources. |
234 | setOperationAction(Op: ISD::FP_TO_SINT, VT: MVT::i32, Action: Custom); |
235 | setOperationAction(Op: ISD::FP_TO_UINT, VT: MVT::i32, Action: Custom); |
236 | } |
237 | |
238 | setTruncStoreAction(ValVT: MVT::v2i32, MemVT: MVT::v2i16, Action: Expand); |
239 | setTruncStoreAction(ValVT: MVT::v3i32, MemVT: MVT::v3i16, Action: Expand); |
240 | setTruncStoreAction(ValVT: MVT::v4i32, MemVT: MVT::v4i16, Action: Expand); |
241 | setTruncStoreAction(ValVT: MVT::v8i32, MemVT: MVT::v8i16, Action: Expand); |
242 | setTruncStoreAction(ValVT: MVT::v16i32, MemVT: MVT::v16i16, Action: Expand); |
243 | setTruncStoreAction(ValVT: MVT::v32i32, MemVT: MVT::v32i16, Action: Expand); |
244 | setTruncStoreAction(ValVT: MVT::v2i32, MemVT: MVT::v2i8, Action: Expand); |
245 | setTruncStoreAction(ValVT: MVT::v4i32, MemVT: MVT::v4i8, Action: Expand); |
246 | setTruncStoreAction(ValVT: MVT::v8i32, MemVT: MVT::v8i8, Action: Expand); |
247 | setTruncStoreAction(ValVT: MVT::v16i32, MemVT: MVT::v16i8, Action: Expand); |
248 | setTruncStoreAction(ValVT: MVT::v32i32, MemVT: MVT::v32i8, Action: Expand); |
249 | setTruncStoreAction(ValVT: MVT::v2i16, MemVT: MVT::v2i8, Action: Expand); |
250 | setTruncStoreAction(ValVT: MVT::v4i16, MemVT: MVT::v4i8, Action: Expand); |
251 | setTruncStoreAction(ValVT: MVT::v8i16, MemVT: MVT::v8i8, Action: Expand); |
252 | setTruncStoreAction(ValVT: MVT::v16i16, MemVT: MVT::v16i8, Action: Expand); |
253 | setTruncStoreAction(ValVT: MVT::v32i16, MemVT: MVT::v32i8, Action: Expand); |
254 | |
255 | setTruncStoreAction(ValVT: MVT::v3i64, MemVT: MVT::v3i16, Action: Expand); |
256 | setTruncStoreAction(ValVT: MVT::v3i64, MemVT: MVT::v3i32, Action: Expand); |
257 | setTruncStoreAction(ValVT: MVT::v4i64, MemVT: MVT::v4i8, Action: Expand); |
258 | setTruncStoreAction(ValVT: MVT::v8i64, MemVT: MVT::v8i8, Action: Expand); |
259 | setTruncStoreAction(ValVT: MVT::v8i64, MemVT: MVT::v8i16, Action: Expand); |
260 | setTruncStoreAction(ValVT: MVT::v8i64, MemVT: MVT::v8i32, Action: Expand); |
261 | setTruncStoreAction(ValVT: MVT::v16i64, MemVT: MVT::v16i32, Action: Expand); |
262 | |
263 | setOperationAction(Ops: ISD::GlobalAddress, VTs: {MVT::i32, MVT::i64}, Action: Custom); |
264 | |
265 | setOperationAction(Op: ISD::SELECT, VT: MVT::i1, Action: Promote); |
266 | setOperationAction(Op: ISD::SELECT, VT: MVT::i64, Action: Custom); |
267 | setOperationAction(Op: ISD::SELECT, VT: MVT::f64, Action: Promote); |
268 | AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::f64, DestVT: MVT::i64); |
269 | |
270 | setOperationAction(Ops: ISD::FSQRT, VTs: {MVT::f32, MVT::f64}, Action: Custom); |
271 | |
272 | setOperationAction(Ops: ISD::SELECT_CC, |
273 | VTs: {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Action: Expand); |
274 | |
275 | setOperationAction(Op: ISD::SETCC, VT: MVT::i1, Action: Promote); |
276 | setOperationAction(Ops: ISD::SETCC, VTs: {MVT::v2i1, MVT::v4i1}, Action: Expand); |
277 | AddPromotedToType(Opc: ISD::SETCC, OrigVT: MVT::i1, DestVT: MVT::i32); |
278 | |
279 | setOperationAction(Ops: ISD::TRUNCATE, |
280 | VTs: {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, |
281 | MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32, |
282 | MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32}, |
283 | Action: Expand); |
284 | setOperationAction(Ops: ISD::FP_ROUND, |
285 | VTs: {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, |
286 | MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32, |
287 | MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32}, |
288 | Action: Expand); |
289 | |
290 | setOperationAction(Ops: ISD::SIGN_EXTEND_INREG, |
291 | VTs: {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16, |
292 | MVT::v3i16, MVT::v4i16, MVT::Other}, |
293 | Action: Custom); |
294 | |
295 | setOperationAction(Op: ISD::BRCOND, VT: MVT::Other, Action: Custom); |
296 | setOperationAction(Ops: ISD::BR_CC, |
297 | VTs: {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Action: Expand); |
298 | |
299 | setOperationAction(Ops: {ISD::UADDO, ISD::USUBO}, VT: MVT::i32, Action: Legal); |
300 | |
301 | setOperationAction(Ops: {ISD::UADDO_CARRY, ISD::USUBO_CARRY}, VT: MVT::i32, Action: Legal); |
302 | |
303 | setOperationAction(Ops: {ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS}, VT: MVT::i64, |
304 | Action: Expand); |
305 | |
306 | #if 0 |
307 | setOperationAction({ISD::UADDO_CARRY, ISD::USUBO_CARRY}, MVT::i64, Legal); |
308 | #endif |
309 | |
310 | // We only support LOAD/STORE and vector manipulation ops for vectors |
311 | // with > 4 elements. |
312 | for (MVT VT : |
313 | {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32, |
314 | MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32, |
315 | MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16, |
316 | MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32, |
317 | MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64, |
318 | MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16, |
319 | MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32, |
320 | MVT::v32i16, MVT::v32f16, MVT::v32bf16}) { |
321 | for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { |
322 | switch (Op) { |
323 | case ISD::LOAD: |
324 | case ISD::STORE: |
325 | case ISD::BUILD_VECTOR: |
326 | case ISD::BITCAST: |
327 | case ISD::UNDEF: |
328 | case ISD::EXTRACT_VECTOR_ELT: |
329 | case ISD::INSERT_VECTOR_ELT: |
330 | case ISD::SCALAR_TO_VECTOR: |
331 | case ISD::IS_FPCLASS: |
332 | break; |
333 | case ISD::EXTRACT_SUBVECTOR: |
334 | case ISD::INSERT_SUBVECTOR: |
335 | case ISD::CONCAT_VECTORS: |
336 | setOperationAction(Op, VT, Action: Custom); |
337 | break; |
338 | default: |
339 | setOperationAction(Op, VT, Action: Expand); |
340 | break; |
341 | } |
342 | } |
343 | } |
344 | |
345 | setOperationAction(Op: ISD::FP_EXTEND, VT: MVT::v4f32, Action: Expand); |
346 | |
347 | // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that |
348 | // is expanded to avoid having two separate loops in case the index is a VGPR. |
349 | |
350 | // Most operations are naturally 32-bit vector operations. We only support |
351 | // load and store of i64 vectors, so promote v2i64 vector operations to v4i32. |
352 | for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) { |
353 | setOperationAction(Op: ISD::BUILD_VECTOR, VT: Vec64, Action: Promote); |
354 | AddPromotedToType(Opc: ISD::BUILD_VECTOR, OrigVT: Vec64, DestVT: MVT::v4i32); |
355 | |
356 | setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: Vec64, Action: Promote); |
357 | AddPromotedToType(Opc: ISD::EXTRACT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v4i32); |
358 | |
359 | setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec64, Action: Promote); |
360 | AddPromotedToType(Opc: ISD::INSERT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v4i32); |
361 | |
362 | setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: Vec64, Action: Promote); |
363 | AddPromotedToType(Opc: ISD::SCALAR_TO_VECTOR, OrigVT: Vec64, DestVT: MVT::v4i32); |
364 | } |
365 | |
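// Likewise promote the wider 64-bit element vectors to the corresponding
// v*i32 types for the same vector manipulation operations.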
366 | for (MVT Vec64 : { MVT::v3i64, MVT::v3f64 }) { |
367 | setOperationAction(Op: ISD::BUILD_VECTOR, VT: Vec64, Action: Promote); |
368 | AddPromotedToType(Opc: ISD::BUILD_VECTOR, OrigVT: Vec64, DestVT: MVT::v6i32); |
369 | |
370 | setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: Vec64, Action: Promote); |
371 | AddPromotedToType(Opc: ISD::EXTRACT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v6i32); |
372 | |
373 | setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec64, Action: Promote); |
374 | AddPromotedToType(Opc: ISD::INSERT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v6i32); |
375 | |
376 | setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: Vec64, Action: Promote); |
377 | AddPromotedToType(Opc: ISD::SCALAR_TO_VECTOR, OrigVT: Vec64, DestVT: MVT::v6i32); |
378 | } |
379 | |
380 | for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) { |
381 | setOperationAction(Op: ISD::BUILD_VECTOR, VT: Vec64, Action: Promote); |
382 | AddPromotedToType(Opc: ISD::BUILD_VECTOR, OrigVT: Vec64, DestVT: MVT::v8i32); |
383 | |
384 | setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: Vec64, Action: Promote); |
385 | AddPromotedToType(Opc: ISD::EXTRACT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v8i32); |
386 | |
387 | setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec64, Action: Promote); |
388 | AddPromotedToType(Opc: ISD::INSERT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v8i32); |
389 | |
390 | setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: Vec64, Action: Promote); |
391 | AddPromotedToType(Opc: ISD::SCALAR_TO_VECTOR, OrigVT: Vec64, DestVT: MVT::v8i32); |
392 | } |
393 | |
394 | for (MVT Vec64 : { MVT::v8i64, MVT::v8f64 }) { |
395 | setOperationAction(Op: ISD::BUILD_VECTOR, VT: Vec64, Action: Promote); |
396 | AddPromotedToType(Opc: ISD::BUILD_VECTOR, OrigVT: Vec64, DestVT: MVT::v16i32); |
397 | |
398 | setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: Vec64, Action: Promote); |
399 | AddPromotedToType(Opc: ISD::EXTRACT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v16i32); |
400 | |
401 | setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec64, Action: Promote); |
402 | AddPromotedToType(Opc: ISD::INSERT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v16i32); |
403 | |
404 | setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: Vec64, Action: Promote); |
405 | AddPromotedToType(Opc: ISD::SCALAR_TO_VECTOR, OrigVT: Vec64, DestVT: MVT::v16i32); |
406 | } |
407 | |
408 | for (MVT Vec64 : { MVT::v16i64, MVT::v16f64 }) { |
409 | setOperationAction(Op: ISD::BUILD_VECTOR, VT: Vec64, Action: Promote); |
410 | AddPromotedToType(Opc: ISD::BUILD_VECTOR, OrigVT: Vec64, DestVT: MVT::v32i32); |
411 | |
412 | setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: Vec64, Action: Promote); |
413 | AddPromotedToType(Opc: ISD::EXTRACT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v32i32); |
414 | |
415 | setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec64, Action: Promote); |
416 | AddPromotedToType(Opc: ISD::INSERT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v32i32); |
417 | |
418 | setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: Vec64, Action: Promote); |
419 | AddPromotedToType(Opc: ISD::SCALAR_TO_VECTOR, OrigVT: Vec64, DestVT: MVT::v32i32); |
420 | } |
421 | |
422 | setOperationAction(Ops: ISD::VECTOR_SHUFFLE, |
423 | VTs: {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32}, |
424 | Action: Expand); |
425 | |
426 | setOperationAction(Ops: ISD::BUILD_VECTOR, VTs: {MVT::v4f16, MVT::v4i16, MVT::v4bf16}, |
427 | Action: Custom); |
428 | |
429 | // Avoid stack access for these. |
430 | // TODO: Generalize to more vector types. |
431 | setOperationAction(Ops: {ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT}, |
432 | VTs: {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8, |
433 | MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16}, |
434 | Action: Custom); |
435 | |
436 | // Deal with vec3 vector operations when widened to vec4. |
437 | setOperationAction(Ops: ISD::INSERT_SUBVECTOR, |
438 | VTs: {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Action: Custom); |
439 | |
440 | // Deal with vec5/6/7 vector operations when widened to vec8. |
441 | setOperationAction(Ops: ISD::INSERT_SUBVECTOR, |
442 | VTs: {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32, |
443 | MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32, |
444 | MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32, |
445 | MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32}, |
446 | Action: Custom); |
447 | |
448 | // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, |
449 | // and output demarshalling |
450 | setOperationAction(Ops: ISD::ATOMIC_CMP_SWAP, VTs: {MVT::i32, MVT::i64}, Action: Custom); |
451 | |
// We can't return success/failure, only the old value; let LLVM add the
// comparison.
454 | setOperationAction(Ops: ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VTs: {MVT::i32, MVT::i64}, |
455 | Action: Expand); |
456 | |
457 | setOperationAction(Ops: ISD::ADDRSPACECAST, VTs: {MVT::i32, MVT::i64}, Action: Custom); |
458 | |
459 | setOperationAction(Ops: ISD::BITREVERSE, VTs: {MVT::i32, MVT::i64}, Action: Legal); |
460 | |
461 | // FIXME: This should be narrowed to i32, but that only happens if i64 is |
462 | // illegal. |
463 | // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32. |
464 | setOperationAction(Ops: ISD::BSWAP, VTs: {MVT::i64, MVT::i32}, Action: Legal); |
465 | |
// This is s_memtime on SI and s_memrealtime on VI.
467 | setOperationAction(Op: ISD::READCYCLECOUNTER, VT: MVT::i64, Action: Legal); |
468 | |
469 | if (Subtarget->hasSMemRealTime() || |
470 | Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11) |
471 | setOperationAction(Op: ISD::READSTEADYCOUNTER, VT: MVT::i64, Action: Legal); |
472 | setOperationAction(Ops: {ISD::TRAP, ISD::DEBUGTRAP}, VT: MVT::Other, Action: Custom); |
473 | |
474 | if (Subtarget->has16BitInsts()) { |
475 | setOperationAction(Ops: {ISD::FPOW, ISD::FPOWI}, VT: MVT::f16, Action: Promote); |
476 | setOperationAction(Ops: {ISD::FLOG, ISD::FEXP, ISD::FLOG10}, VT: MVT::f16, Action: Custom); |
477 | } else { |
478 | setOperationAction(Op: ISD::FSQRT, VT: MVT::f16, Action: Custom); |
479 | } |
480 | |
481 | if (Subtarget->hasMadMacF32Insts()) |
482 | setOperationAction(Op: ISD::FMAD, VT: MVT::f32, Action: Legal); |
483 | |
484 | if (!Subtarget->hasBFI()) |
485 | // fcopysign can be done in a single instruction with BFI. |
486 | setOperationAction(Ops: ISD::FCOPYSIGN, VTs: {MVT::f32, MVT::f64}, Action: Expand); |
487 | |
488 | if (!Subtarget->hasBCNT(Size: 32)) |
489 | setOperationAction(Op: ISD::CTPOP, VT: MVT::i32, Action: Expand); |
490 | |
491 | if (!Subtarget->hasBCNT(Size: 64)) |
492 | setOperationAction(Op: ISD::CTPOP, VT: MVT::i64, Action: Expand); |
493 | |
494 | if (Subtarget->hasFFBH()) |
495 | setOperationAction(Ops: {ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, VT: MVT::i32, Action: Custom); |
496 | |
497 | if (Subtarget->hasFFBL()) |
498 | setOperationAction(Ops: {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, VT: MVT::i32, Action: Custom); |
499 | |
500 | // We only really have 32-bit BFE instructions (and 16-bit on VI). |
501 | // |
502 | // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any |
503 | // effort to match them now. We want this to be false for i64 cases when the |
504 | // extraction isn't restricted to the upper or lower half. Ideally we would |
505 | // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that |
506 | // span the midpoint are probably relatively rare, so don't worry about them |
507 | // for now. |
508 | if (Subtarget->hasBFE()) |
509 | setHasExtractBitsInsn(true); |
510 | |
511 | // Clamp modifier on add/sub |
512 | if (Subtarget->hasIntClamp()) |
513 | setOperationAction(Ops: {ISD::UADDSAT, ISD::USUBSAT}, VT: MVT::i32, Action: Legal); |
514 | |
515 | if (Subtarget->hasAddNoCarry()) |
516 | setOperationAction(Ops: {ISD::SADDSAT, ISD::SSUBSAT}, VTs: {MVT::i16, MVT::i32}, |
517 | Action: Legal); |
518 | |
519 | setOperationAction(Ops: {ISD::FMINNUM, ISD::FMAXNUM}, VTs: {MVT::f32, MVT::f64}, |
520 | Action: Custom); |
521 | |
522 | // These are really only legal for ieee_mode functions. We should be avoiding |
523 | // them for functions that don't have ieee_mode enabled, so just say they are |
524 | // legal. |
525 | setOperationAction(Ops: {ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE}, |
526 | VTs: {MVT::f32, MVT::f64}, Action: Legal); |
527 | |
528 | if (Subtarget->haveRoundOpsF64()) |
529 | setOperationAction(Ops: {ISD::FTRUNC, ISD::FCEIL, ISD::FROUNDEVEN}, VT: MVT::f64, |
530 | Action: Legal); |
531 | else |
532 | setOperationAction(Ops: {ISD::FCEIL, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FFLOOR}, |
533 | VT: MVT::f64, Action: Custom); |
534 | |
535 | setOperationAction(Op: ISD::FFLOOR, VT: MVT::f64, Action: Legal); |
536 | setOperationAction(Ops: {ISD::FLDEXP, ISD::STRICT_FLDEXP}, VTs: {MVT::f32, MVT::f64}, |
537 | Action: Legal); |
538 | setOperationAction(Ops: ISD::FFREXP, VTs: {MVT::f32, MVT::f64}, Action: Custom); |
539 | |
540 | setOperationAction(Ops: {ISD::FSIN, ISD::FCOS, ISD::FDIV}, VT: MVT::f32, Action: Custom); |
541 | setOperationAction(Op: ISD::FDIV, VT: MVT::f64, Action: Custom); |
542 | |
543 | setOperationAction(Ops: ISD::BF16_TO_FP, VTs: {MVT::i16, MVT::f32, MVT::f64}, Action: Expand); |
544 | setOperationAction(Ops: ISD::FP_TO_BF16, VTs: {MVT::i16, MVT::f32, MVT::f64}, Action: Expand); |
545 | |
546 | // Custom lower these because we can't specify a rule based on an illegal |
547 | // source bf16. |
548 | setOperationAction(Ops: {ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, VT: MVT::f32, Action: Custom); |
549 | setOperationAction(Ops: {ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, VT: MVT::f64, Action: Custom); |
550 | |
551 | if (Subtarget->has16BitInsts()) { |
552 | setOperationAction(Ops: {ISD::Constant, ISD::SMIN, ISD::SMAX, ISD::UMIN, |
553 | ISD::UMAX, ISD::UADDSAT, ISD::USUBSAT}, |
554 | VT: MVT::i16, Action: Legal); |
555 | |
556 | AddPromotedToType(Opc: ISD::SIGN_EXTEND, OrigVT: MVT::i16, DestVT: MVT::i32); |
557 | |
558 | setOperationAction(Ops: {ISD::ROTR, ISD::ROTL, ISD::SELECT_CC, ISD::BR_CC}, |
559 | VT: MVT::i16, Action: Expand); |
560 | |
561 | setOperationAction(Ops: {ISD::SIGN_EXTEND, ISD::SDIV, ISD::UDIV, ISD::SREM, |
562 | ISD::UREM, ISD::BITREVERSE, ISD::CTTZ, |
563 | ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF, |
564 | ISD::CTPOP}, |
565 | VT: MVT::i16, Action: Promote); |
566 | |
567 | setOperationAction(Op: ISD::LOAD, VT: MVT::i16, Action: Custom); |
568 | |
569 | setTruncStoreAction(ValVT: MVT::i64, MemVT: MVT::i16, Action: Expand); |
570 | |
571 | setOperationAction(Op: ISD::FP16_TO_FP, VT: MVT::i16, Action: Promote); |
572 | AddPromotedToType(Opc: ISD::FP16_TO_FP, OrigVT: MVT::i16, DestVT: MVT::i32); |
573 | setOperationAction(Op: ISD::FP_TO_FP16, VT: MVT::i16, Action: Promote); |
574 | AddPromotedToType(Opc: ISD::FP_TO_FP16, OrigVT: MVT::i16, DestVT: MVT::i32); |
575 | |
576 | setOperationAction(Ops: {ISD::FP_TO_SINT, ISD::FP_TO_UINT}, VT: MVT::i16, Action: Custom); |
577 | setOperationAction(Ops: {ISD::SINT_TO_FP, ISD::UINT_TO_FP}, VT: MVT::i16, Action: Custom); |
579 | |
580 | setOperationAction(Ops: {ISD::SINT_TO_FP, ISD::UINT_TO_FP}, VT: MVT::i32, Action: Custom); |
581 | |
582 | // F16 - Constant Actions. |
583 | setOperationAction(Op: ISD::ConstantFP, VT: MVT::f16, Action: Legal); |
584 | setOperationAction(Op: ISD::ConstantFP, VT: MVT::bf16, Action: Legal); |
585 | |
586 | // F16 - Load/Store Actions. |
587 | setOperationAction(Op: ISD::LOAD, VT: MVT::f16, Action: Promote); |
588 | AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::f16, DestVT: MVT::i16); |
589 | setOperationAction(Op: ISD::STORE, VT: MVT::f16, Action: Promote); |
590 | AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::f16, DestVT: MVT::i16); |
591 | |
592 | // BF16 - Load/Store Actions. |
593 | setOperationAction(Op: ISD::LOAD, VT: MVT::bf16, Action: Promote); |
594 | AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::bf16, DestVT: MVT::i16); |
595 | setOperationAction(Op: ISD::STORE, VT: MVT::bf16, Action: Promote); |
596 | AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::bf16, DestVT: MVT::i16); |
597 | |
598 | // F16 - VOP1 Actions. |
599 | setOperationAction(Ops: {ISD::FP_ROUND, ISD::STRICT_FP_ROUND, ISD::FCOS, |
600 | ISD::FSIN, ISD::FROUND, ISD::FPTRUNC_ROUND}, |
601 | VT: MVT::f16, Action: Custom); |
602 | |
603 | setOperationAction(Ops: {ISD::FP_TO_SINT, ISD::FP_TO_UINT}, VT: MVT::f16, Action: Promote); |
604 | setOperationAction(Ops: {ISD::FP_TO_SINT, ISD::FP_TO_UINT}, VT: MVT::bf16, Action: Promote); |
605 | |
606 | // F16 - VOP2 Actions. |
607 | setOperationAction(Ops: {ISD::BR_CC, ISD::SELECT_CC}, VTs: {MVT::f16, MVT::bf16}, |
608 | Action: Expand); |
609 | setOperationAction(Ops: {ISD::FLDEXP, ISD::STRICT_FLDEXP}, VT: MVT::f16, Action: Custom); |
610 | setOperationAction(Op: ISD::FFREXP, VT: MVT::f16, Action: Custom); |
611 | setOperationAction(Op: ISD::FDIV, VT: MVT::f16, Action: Custom); |
612 | |
613 | // F16 - VOP3 Actions. |
614 | setOperationAction(Op: ISD::FMA, VT: MVT::f16, Action: Legal); |
615 | if (STI.hasMadF16()) |
616 | setOperationAction(Op: ISD::FMAD, VT: MVT::f16, Action: Legal); |
617 | |
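// As with the wide 32-bit element vectors above, only load/store and basic
// vector manipulation operations keep their existing handling for the packed
// 16-bit vector types; CONCAT_VECTORS is custom lowered and everything else
// is expanded.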
618 | for (MVT VT : |
619 | {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16, |
620 | MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, |
621 | MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) { |
622 | for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { |
623 | switch (Op) { |
624 | case ISD::LOAD: |
625 | case ISD::STORE: |
626 | case ISD::BUILD_VECTOR: |
627 | case ISD::BITCAST: |
628 | case ISD::UNDEF: |
629 | case ISD::EXTRACT_VECTOR_ELT: |
630 | case ISD::INSERT_VECTOR_ELT: |
631 | case ISD::INSERT_SUBVECTOR: |
632 | case ISD::EXTRACT_SUBVECTOR: |
633 | case ISD::SCALAR_TO_VECTOR: |
634 | case ISD::IS_FPCLASS: |
635 | break; |
636 | case ISD::CONCAT_VECTORS: |
637 | setOperationAction(Op, VT, Action: Custom); |
638 | break; |
639 | default: |
640 | setOperationAction(Op, VT, Action: Expand); |
641 | break; |
642 | } |
643 | } |
644 | } |
645 | |
646 | // v_perm_b32 can handle either of these. |
647 | setOperationAction(Ops: ISD::BSWAP, VTs: {MVT::i16, MVT::v2i16}, Action: Legal); |
648 | setOperationAction(Op: ISD::BSWAP, VT: MVT::v4i16, Action: Custom); |
649 | |
650 | // XXX - Do these do anything? Vector constants turn into build_vector. |
651 | setOperationAction(Ops: ISD::Constant, VTs: {MVT::v2i16, MVT::v2f16}, Action: Legal); |
652 | |
653 | setOperationAction(Ops: ISD::UNDEF, VTs: {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, |
654 | Action: Legal); |
655 | |
656 | setOperationAction(Op: ISD::STORE, VT: MVT::v2i16, Action: Promote); |
657 | AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v2i16, DestVT: MVT::i32); |
658 | setOperationAction(Op: ISD::STORE, VT: MVT::v2f16, Action: Promote); |
659 | AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v2f16, DestVT: MVT::i32); |
660 | |
661 | setOperationAction(Op: ISD::LOAD, VT: MVT::v2i16, Action: Promote); |
662 | AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v2i16, DestVT: MVT::i32); |
663 | setOperationAction(Op: ISD::LOAD, VT: MVT::v2f16, Action: Promote); |
664 | AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v2f16, DestVT: MVT::i32); |
665 | |
666 | setOperationAction(Op: ISD::AND, VT: MVT::v2i16, Action: Promote); |
667 | AddPromotedToType(Opc: ISD::AND, OrigVT: MVT::v2i16, DestVT: MVT::i32); |
668 | setOperationAction(Op: ISD::OR, VT: MVT::v2i16, Action: Promote); |
669 | AddPromotedToType(Opc: ISD::OR, OrigVT: MVT::v2i16, DestVT: MVT::i32); |
670 | setOperationAction(Op: ISD::XOR, VT: MVT::v2i16, Action: Promote); |
671 | AddPromotedToType(Opc: ISD::XOR, OrigVT: MVT::v2i16, DestVT: MVT::i32); |
672 | |
673 | setOperationAction(Op: ISD::LOAD, VT: MVT::v4i16, Action: Promote); |
674 | AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v4i16, DestVT: MVT::v2i32); |
675 | setOperationAction(Op: ISD::LOAD, VT: MVT::v4f16, Action: Promote); |
676 | AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v4f16, DestVT: MVT::v2i32); |
677 | setOperationAction(Op: ISD::LOAD, VT: MVT::v4bf16, Action: Promote); |
678 | AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v4bf16, DestVT: MVT::v2i32); |
679 | |
680 | setOperationAction(Op: ISD::STORE, VT: MVT::v4i16, Action: Promote); |
681 | AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4i16, DestVT: MVT::v2i32); |
682 | setOperationAction(Op: ISD::STORE, VT: MVT::v4f16, Action: Promote); |
683 | AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4f16, DestVT: MVT::v2i32); |
684 | setOperationAction(Op: ISD::STORE, VT: MVT::v4bf16, Action: Promote); |
685 | AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4bf16, DestVT: MVT::v2i32); |
686 | |
687 | setOperationAction(Op: ISD::LOAD, VT: MVT::v8i16, Action: Promote); |
688 | AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v8i16, DestVT: MVT::v4i32); |
689 | setOperationAction(Op: ISD::LOAD, VT: MVT::v8f16, Action: Promote); |
690 | AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v8f16, DestVT: MVT::v4i32); |
691 | setOperationAction(Op: ISD::LOAD, VT: MVT::v8bf16, Action: Promote); |
692 | AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v8bf16, DestVT: MVT::v4i32); |
693 | |
694 | setOperationAction(Op: ISD::STORE, VT: MVT::v4i16, Action: Promote); |
695 | AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4i16, DestVT: MVT::v2i32); |
696 | setOperationAction(Op: ISD::STORE, VT: MVT::v4f16, Action: Promote); |
697 | AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4f16, DestVT: MVT::v2i32); |
698 | |
699 | setOperationAction(Op: ISD::STORE, VT: MVT::v8i16, Action: Promote); |
700 | AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v8i16, DestVT: MVT::v4i32); |
701 | setOperationAction(Op: ISD::STORE, VT: MVT::v8f16, Action: Promote); |
702 | AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v8f16, DestVT: MVT::v4i32); |
703 | setOperationAction(Op: ISD::STORE, VT: MVT::v8bf16, Action: Promote); |
704 | AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v8bf16, DestVT: MVT::v4i32); |
705 | |
706 | setOperationAction(Op: ISD::LOAD, VT: MVT::v16i16, Action: Promote); |
707 | AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v16i16, DestVT: MVT::v8i32); |
708 | setOperationAction(Op: ISD::LOAD, VT: MVT::v16f16, Action: Promote); |
709 | AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v16f16, DestVT: MVT::v8i32); |
710 | setOperationAction(Op: ISD::LOAD, VT: MVT::v16bf16, Action: Promote); |
711 | AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v16bf16, DestVT: MVT::v8i32); |
712 | |
713 | setOperationAction(Op: ISD::STORE, VT: MVT::v16i16, Action: Promote); |
714 | AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v16i16, DestVT: MVT::v8i32); |
715 | setOperationAction(Op: ISD::STORE, VT: MVT::v16f16, Action: Promote); |
716 | AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v16f16, DestVT: MVT::v8i32); |
717 | setOperationAction(Op: ISD::STORE, VT: MVT::v16bf16, Action: Promote); |
718 | AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v16bf16, DestVT: MVT::v8i32); |
719 | |
720 | setOperationAction(Op: ISD::LOAD, VT: MVT::v32i16, Action: Promote); |
721 | AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v32i16, DestVT: MVT::v16i32); |
722 | setOperationAction(Op: ISD::LOAD, VT: MVT::v32f16, Action: Promote); |
723 | AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v32f16, DestVT: MVT::v16i32); |
724 | setOperationAction(Op: ISD::LOAD, VT: MVT::v32bf16, Action: Promote); |
725 | AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v32bf16, DestVT: MVT::v16i32); |
726 | |
727 | setOperationAction(Op: ISD::STORE, VT: MVT::v32i16, Action: Promote); |
728 | AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v32i16, DestVT: MVT::v16i32); |
729 | setOperationAction(Op: ISD::STORE, VT: MVT::v32f16, Action: Promote); |
730 | AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v32f16, DestVT: MVT::v16i32); |
731 | setOperationAction(Op: ISD::STORE, VT: MVT::v32bf16, Action: Promote); |
732 | AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v32bf16, DestVT: MVT::v16i32); |
733 | |
734 | setOperationAction(Ops: {ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND}, |
735 | VT: MVT::v2i32, Action: Expand); |
736 | setOperationAction(Op: ISD::FP_EXTEND, VT: MVT::v2f32, Action: Expand); |
737 | |
738 | setOperationAction(Ops: {ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND}, |
739 | VT: MVT::v4i32, Action: Expand); |
740 | |
741 | setOperationAction(Ops: {ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND}, |
742 | VT: MVT::v8i32, Action: Expand); |
743 | |
744 | setOperationAction(Ops: ISD::BUILD_VECTOR, VTs: {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, |
745 | Action: Subtarget->hasVOP3PInsts() ? Legal : Custom); |
746 | |
747 | setOperationAction(Op: ISD::FNEG, VT: MVT::v2f16, Action: Legal); |
748 | // This isn't really legal, but this avoids the legalizer unrolling it (and |
749 | // allows matching fneg (fabs x) patterns) |
750 | setOperationAction(Op: ISD::FABS, VT: MVT::v2f16, Action: Legal); |
751 | |
752 | setOperationAction(Ops: {ISD::FMAXNUM, ISD::FMINNUM}, VT: MVT::f16, Action: Custom); |
753 | setOperationAction(Ops: {ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, VT: MVT::f16, Action: Legal); |
754 | |
755 | setOperationAction(Ops: {ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE}, |
756 | VTs: {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16}, |
757 | Action: Custom); |
758 | |
759 | setOperationAction(Ops: {ISD::FMINNUM, ISD::FMAXNUM}, |
760 | VTs: {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16}, |
761 | Action: Expand); |
762 | |
763 | for (MVT Vec16 : |
764 | {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16, |
765 | MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) { |
766 | setOperationAction( |
767 | Ops: {ISD::BUILD_VECTOR, ISD::EXTRACT_VECTOR_ELT, ISD::SCALAR_TO_VECTOR}, |
768 | VT: Vec16, Action: Custom); |
769 | setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec16, Action: Expand); |
770 | } |
771 | } |
772 | |
773 | if (Subtarget->hasVOP3PInsts()) { |
774 | setOperationAction(Ops: {ISD::ADD, ISD::SUB, ISD::MUL, ISD::SHL, ISD::SRL, |
775 | ISD::SRA, ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX, |
776 | ISD::UADDSAT, ISD::USUBSAT, ISD::SADDSAT, ISD::SSUBSAT}, |
777 | VT: MVT::v2i16, Action: Legal); |
778 | |
779 | setOperationAction(Ops: {ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FMINNUM_IEEE, |
780 | ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE}, |
781 | VT: MVT::v2f16, Action: Legal); |
782 | |
783 | setOperationAction(Ops: ISD::EXTRACT_VECTOR_ELT, VTs: {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, |
784 | Action: Custom); |
785 | |
786 | setOperationAction(Ops: ISD::VECTOR_SHUFFLE, |
787 | VTs: {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16, |
788 | MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16}, |
789 | Action: Custom); |
790 | |
791 | for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16}) |
792 | // Split vector operations. |
793 | setOperationAction(Ops: {ISD::SHL, ISD::SRA, ISD::SRL, ISD::ADD, ISD::SUB, |
794 | ISD::MUL, ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, |
795 | ISD::UMAX, ISD::UADDSAT, ISD::SADDSAT, ISD::USUBSAT, |
796 | ISD::SSUBSAT}, |
797 | VT, Action: Custom); |
798 | |
799 | for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16}) |
800 | // Split vector operations. |
801 | setOperationAction(Ops: {ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE}, |
802 | VT, Action: Custom); |
803 | |
804 | setOperationAction(Ops: {ISD::FMAXNUM, ISD::FMINNUM}, VTs: {MVT::v2f16, MVT::v4f16}, |
805 | Action: Custom); |
806 | |
807 | setOperationAction(Op: ISD::FEXP, VT: MVT::v2f16, Action: Custom); |
808 | setOperationAction(Ops: ISD::SELECT, VTs: {MVT::v4i16, MVT::v4f16, MVT::v4bf16}, |
809 | Action: Custom); |
810 | |
811 | if (Subtarget->hasPackedFP32Ops()) { |
812 | setOperationAction(Ops: {ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FNEG}, |
813 | VT: MVT::v2f32, Action: Legal); |
814 | setOperationAction(Ops: {ISD::FADD, ISD::FMUL, ISD::FMA}, |
815 | VTs: {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32}, |
816 | Action: Custom); |
817 | } |
818 | } |
819 | |
820 | setOperationAction(Ops: {ISD::FNEG, ISD::FABS}, VT: MVT::v4f16, Action: Custom); |
821 | |
822 | if (Subtarget->has16BitInsts()) { |
823 | setOperationAction(Op: ISD::SELECT, VT: MVT::v2i16, Action: Promote); |
824 | AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v2i16, DestVT: MVT::i32); |
825 | setOperationAction(Op: ISD::SELECT, VT: MVT::v2f16, Action: Promote); |
826 | AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v2f16, DestVT: MVT::i32); |
827 | } else { |
828 | // Legalization hack. |
829 | setOperationAction(Ops: ISD::SELECT, VTs: {MVT::v2i16, MVT::v2f16}, Action: Custom); |
830 | |
831 | setOperationAction(Ops: {ISD::FNEG, ISD::FABS}, VT: MVT::v2f16, Action: Custom); |
832 | } |
833 | |
834 | setOperationAction(Ops: ISD::SELECT, |
835 | VTs: {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8, |
836 | MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16, |
837 | MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16, |
838 | MVT::v32f16, MVT::v32bf16}, |
839 | Action: Custom); |
840 | |
841 | setOperationAction(Ops: {ISD::SMULO, ISD::UMULO}, VT: MVT::i64, Action: Custom); |
842 | |
843 | if (Subtarget->hasScalarSMulU64()) |
844 | setOperationAction(Op: ISD::MUL, VT: MVT::i64, Action: Custom); |
845 | |
846 | if (Subtarget->hasMad64_32()) |
847 | setOperationAction(Ops: {ISD::SMUL_LOHI, ISD::UMUL_LOHI}, VT: MVT::i32, Action: Custom); |
848 | |
849 | if (Subtarget->hasPrefetch()) |
850 | setOperationAction(Op: ISD::PREFETCH, VT: MVT::Other, Action: Custom); |
851 | |
852 | if (Subtarget->hasIEEEMinMax()) { |
853 | setOperationAction(Ops: {ISD::FMAXIMUM, ISD::FMINIMUM}, |
854 | VTs: {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Action: Legal); |
855 | setOperationAction(Ops: {ISD::FMINIMUM, ISD::FMAXIMUM}, |
856 | VTs: {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16}, |
857 | Action: Custom); |
858 | } |
859 | |
860 | setOperationAction(Ops: ISD::INTRINSIC_WO_CHAIN, |
861 | VTs: {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16, |
862 | MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128, |
863 | MVT::i8}, |
864 | Action: Custom); |
865 | |
866 | setOperationAction(Ops: ISD::INTRINSIC_W_CHAIN, |
867 | VTs: {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16, |
868 | MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16, |
869 | MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16, |
870 | MVT::i16, MVT::bf16, MVT::i8, MVT::i128}, |
871 | Action: Custom); |
872 | |
873 | setOperationAction(Ops: ISD::INTRINSIC_VOID, |
874 | VTs: {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16, |
875 | MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16, |
876 | MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, |
877 | MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128}, |
878 | Action: Custom); |
879 | |
880 | setOperationAction(Op: ISD::STACKSAVE, VT: MVT::Other, Action: Custom); |
881 | setOperationAction(Op: ISD::GET_ROUNDING, VT: MVT::i32, Action: Custom); |
882 | setOperationAction(Op: ISD::SET_ROUNDING, VT: MVT::Other, Action: Custom); |
883 | setOperationAction(Op: ISD::GET_FPENV, VT: MVT::i64, Action: Custom); |
884 | setOperationAction(Op: ISD::SET_FPENV, VT: MVT::i64, Action: Custom); |
885 | |
886 | // TODO: Could move this to custom lowering, could benefit from combines on |
887 | // extract of relevant bits. |
888 | setOperationAction(Op: ISD::GET_FPMODE, VT: MVT::i32, Action: Legal); |
889 | |
890 | setOperationAction(Op: ISD::MUL, VT: MVT::i1, Action: Promote); |
891 | |
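// Register the arithmetic, logic, min/max and conversion nodes that have
// target-specific DAG combines.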
892 | setTargetDAGCombine({ISD::ADD, |
893 | ISD::UADDO_CARRY, |
894 | ISD::SUB, |
895 | ISD::USUBO_CARRY, |
896 | ISD::FADD, |
897 | ISD::FSUB, |
898 | ISD::FDIV, |
899 | ISD::FMINNUM, |
900 | ISD::FMAXNUM, |
901 | ISD::FMINNUM_IEEE, |
902 | ISD::FMAXNUM_IEEE, |
903 | ISD::FMINIMUM, |
904 | ISD::FMAXIMUM, |
905 | ISD::FMA, |
906 | ISD::SMIN, |
907 | ISD::SMAX, |
908 | ISD::UMIN, |
909 | ISD::UMAX, |
910 | ISD::SETCC, |
911 | ISD::AND, |
912 | ISD::OR, |
913 | ISD::XOR, |
914 | ISD::FSHR, |
915 | ISD::SINT_TO_FP, |
916 | ISD::UINT_TO_FP, |
917 | ISD::FCANONICALIZE, |
918 | ISD::SCALAR_TO_VECTOR, |
919 | ISD::ZERO_EXTEND, |
920 | ISD::SIGN_EXTEND_INREG, |
921 | ISD::EXTRACT_VECTOR_ELT, |
922 | ISD::INSERT_VECTOR_ELT, |
923 | ISD::FCOPYSIGN}); |
924 | |
925 | if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16()) |
926 | setTargetDAGCombine(ISD::FP_ROUND); |
927 | |
// All memory operations. Some folding on the pointer operand is done to help
// match the constant offsets in the addressing modes.
930 | setTargetDAGCombine({ISD::LOAD, |
931 | ISD::STORE, |
932 | ISD::ATOMIC_LOAD, |
933 | ISD::ATOMIC_STORE, |
934 | ISD::ATOMIC_CMP_SWAP, |
935 | ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, |
936 | ISD::ATOMIC_SWAP, |
937 | ISD::ATOMIC_LOAD_ADD, |
938 | ISD::ATOMIC_LOAD_SUB, |
939 | ISD::ATOMIC_LOAD_AND, |
940 | ISD::ATOMIC_LOAD_OR, |
941 | ISD::ATOMIC_LOAD_XOR, |
942 | ISD::ATOMIC_LOAD_NAND, |
943 | ISD::ATOMIC_LOAD_MIN, |
944 | ISD::ATOMIC_LOAD_MAX, |
945 | ISD::ATOMIC_LOAD_UMIN, |
946 | ISD::ATOMIC_LOAD_UMAX, |
947 | ISD::ATOMIC_LOAD_FADD, |
948 | ISD::ATOMIC_LOAD_FMIN, |
949 | ISD::ATOMIC_LOAD_FMAX, |
950 | ISD::ATOMIC_LOAD_UINC_WRAP, |
951 | ISD::ATOMIC_LOAD_UDEC_WRAP, |
952 | ISD::INTRINSIC_VOID, |
953 | ISD::INTRINSIC_W_CHAIN}); |
954 | |
955 | // FIXME: In other contexts we pretend this is a per-function property. |
956 | setStackPointerRegisterToSaveRestore(AMDGPU::SGPR32); |
957 | |
958 | setSchedulingPreference(Sched::RegPressure); |
959 | } |
960 | |
961 | const GCNSubtarget *SITargetLowering::getSubtarget() const { |
962 | return Subtarget; |
963 | } |
964 | |
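// The hardware MODE register holds the FP rounding mode, so it is the only
// rounding control register.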
965 | ArrayRef<MCPhysReg> SITargetLowering::getRoundingControlRegisters() const { |
966 | static const MCPhysReg RCRegs[] = {AMDGPU::MODE}; |
967 | return RCRegs; |
968 | } |
969 | |
970 | //===----------------------------------------------------------------------===// |
971 | // TargetLowering queries |
972 | //===----------------------------------------------------------------------===// |
973 | |
974 | // v_mad_mix* support a conversion from f16 to f32. |
975 | // |
// There is only one special case, when denormals are enabled, where this
// would still be OK to use, but we don't currently handle it.
978 | bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, |
979 | EVT DestVT, EVT SrcVT) const { |
980 | return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) || |
981 | (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) && |
982 | DestVT.getScalarType() == MVT::f32 && |
983 | SrcVT.getScalarType() == MVT::f16 && |
984 | // TODO: This probably only requires no input flushing? |
985 | denormalModeIsFlushAllF32(MF: DAG.getMachineFunction()); |
986 | } |
987 | |
988 | bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode, |
989 | LLT DestTy, LLT SrcTy) const { |
990 | return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) || |
991 | (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) && |
992 | DestTy.getScalarSizeInBits() == 32 && |
993 | SrcTy.getScalarSizeInBits() == 16 && |
994 | // TODO: This probably only requires no input flushing? |
995 | denormalModeIsFlushAllF32(MF: *MI.getMF()); |
996 | } |
997 | |
998 | bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const { |
999 | // SI has some legal vector types, but no legal vector operations. Say no |
1000 | // shuffles are legal in order to prefer scalarizing some vector operations. |
1001 | return false; |
1002 | } |
1003 | |
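// For non-kernel calling conventions, vectors of 16-bit elements are passed
// packed two per 32-bit register when 16-bit instructions are available
// (e.g. v3f16 travels as two v2f16 values); wider or odd-sized scalars are
// broken down into i32 pieces.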
1004 | MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, |
1005 | CallingConv::ID CC, |
1006 | EVT VT) const { |
1007 | if (CC == CallingConv::AMDGPU_KERNEL) |
1008 | return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); |
1009 | |
1010 | if (VT.isVector()) { |
1011 | EVT ScalarVT = VT.getScalarType(); |
1012 | unsigned Size = ScalarVT.getSizeInBits(); |
1013 | if (Size == 16) { |
1014 | if (Subtarget->has16BitInsts()) { |
1015 | if (VT.isInteger()) |
1016 | return MVT::v2i16; |
1017 | return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16); |
1018 | } |
1019 | return VT.isInteger() ? MVT::i32 : MVT::f32; |
1020 | } |
1021 | |
1022 | if (Size < 16) |
1023 | return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32; |
1024 | return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32; |
1025 | } |
1026 | |
1027 | if (VT.getSizeInBits() > 32) |
1028 | return MVT::i32; |
1029 | |
1030 | return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); |
1031 | } |
1032 | |
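// Matching register count for the packing above: 16-bit elements are counted
// in pairs, and each element wider than 32 bits takes one register per 32
// bits.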
1033 | unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, |
1034 | CallingConv::ID CC, |
1035 | EVT VT) const { |
1036 | if (CC == CallingConv::AMDGPU_KERNEL) |
1037 | return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT); |
1038 | |
1039 | if (VT.isVector()) { |
1040 | unsigned NumElts = VT.getVectorNumElements(); |
1041 | EVT ScalarVT = VT.getScalarType(); |
1042 | unsigned Size = ScalarVT.getSizeInBits(); |
1043 | |
1044 | // FIXME: Should probably promote 8-bit vectors to i16. |
1045 | if (Size == 16 && Subtarget->has16BitInsts()) |
1046 | return (NumElts + 1) / 2; |
1047 | |
1048 | if (Size <= 32) |
1049 | return NumElts; |
1050 | |
1051 | if (Size > 32) |
1052 | return NumElts * ((Size + 31) / 32); |
1053 | } else if (VT.getSizeInBits() > 32) |
1054 | return (VT.getSizeInBits() + 31) / 32; |
1055 | |
1056 | return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT); |
1057 | } |
1058 | |
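// Break a vector argument into the intermediate and register types implied by
// the packing rules above, and return how many intermediate values are
// produced.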
1059 | unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv( |
1060 | LLVMContext &Context, CallingConv::ID CC, |
1061 | EVT VT, EVT &IntermediateVT, |
1062 | unsigned &NumIntermediates, MVT &RegisterVT) const { |
1063 | if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) { |
1064 | unsigned NumElts = VT.getVectorNumElements(); |
1065 | EVT ScalarVT = VT.getScalarType(); |
1066 | unsigned Size = ScalarVT.getSizeInBits(); |
// FIXME: We should fix the ABI to be the same on targets without 16-bit
// support, but unless we can properly handle 3-vectors, it will still be
// inconsistent.
1070 | if (Size == 16 && Subtarget->has16BitInsts()) { |
1071 | if (ScalarVT == MVT::bf16) { |
1072 | RegisterVT = MVT::i32; |
1073 | IntermediateVT = MVT::v2bf16; |
1074 | } else { |
1075 | RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16; |
1076 | IntermediateVT = RegisterVT; |
1077 | } |
1078 | NumIntermediates = (NumElts + 1) / 2; |
1079 | return NumIntermediates; |
1080 | } |
1081 | |
1082 | if (Size == 32) { |
1083 | RegisterVT = ScalarVT.getSimpleVT(); |
1084 | IntermediateVT = RegisterVT; |
1085 | NumIntermediates = NumElts; |
1086 | return NumIntermediates; |
1087 | } |
1088 | |
1089 | if (Size < 16 && Subtarget->has16BitInsts()) { |
1090 | // FIXME: Should probably form v2i16 pieces |
1091 | RegisterVT = MVT::i16; |
1092 | IntermediateVT = ScalarVT; |
1093 | NumIntermediates = NumElts; |
1094 | return NumIntermediates; |
1095 | } |
1096 | |
1097 | |
1098 | if (Size != 16 && Size <= 32) { |
1099 | RegisterVT = MVT::i32; |
1100 | IntermediateVT = ScalarVT; |
1101 | NumIntermediates = NumElts; |
1102 | return NumIntermediates; |
1103 | } |
1104 | |
1105 | if (Size > 32) { |
1106 | RegisterVT = MVT::i32; |
1107 | IntermediateVT = RegisterVT; |
1108 | NumIntermediates = NumElts * ((Size + 31) / 32); |
1109 | return NumIntermediates; |
1110 | } |
1111 | } |
1112 | |
1113 | return TargetLowering::getVectorTypeBreakdownForCallingConv( |
1114 | Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT); |
1115 | } |
1116 | |
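// Compute the memory type for a load intrinsic's data, clamping a vector
// result to at most MaxNumLanes elements (e.g. a <4 x float> image load with
// only two dmask bits set is treated as a v2f32 access).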
1117 | static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, |
1118 | const DataLayout &DL, Type *Ty, |
1119 | unsigned MaxNumLanes) { |
1120 | assert(MaxNumLanes != 0); |
1121 | |
1122 | LLVMContext &Ctx = Ty->getContext(); |
1123 | if (auto *VT = dyn_cast<FixedVectorType>(Val: Ty)) { |
1124 | unsigned NumElts = std::min(a: MaxNumLanes, b: VT->getNumElements()); |
1125 | return EVT::getVectorVT(Context&: Ctx, VT: TLI.getValueType(DL, Ty: VT->getElementType()), |
1126 | NumElements: NumElts); |
1127 | } |
1128 | |
1129 | return TLI.getValueType(DL, Ty); |
1130 | } |
1131 | |
1132 | // Peek through TFE struct returns to only use the data size. |
1133 | static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, |
1134 | const DataLayout &DL, Type *Ty, |
1135 | unsigned MaxNumLanes) { |
1136 | auto *ST = dyn_cast<StructType>(Val: Ty); |
1137 | if (!ST) |
1138 | return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes); |
1139 | |
1140 | // TFE intrinsics return an aggregate type. |
1141 | assert(ST->getNumContainedTypes() == 2 && |
1142 | ST->getContainedType(1)->isIntegerTy(32)); |
1143 | return memVTFromLoadIntrData(TLI, DL, Ty: ST->getContainedType(i: 0), MaxNumLanes); |
1144 | } |
1145 | |
1146 | /// Map address space 7 to MVT::v5i32 because that's its in-memory |
1147 | /// representation. This return value is vector-typed because there is no |
1148 | /// MVT::i160 and it is not clear if one can be added. While this could |
1149 | /// cause issues during codegen, these address space 7 pointers will be |
1150 | /// rewritten away by then. Therefore, we can return MVT::v5i32 in order |
1151 | /// to allow pre-codegen passes that query TargetTransformInfo, often for cost |
1152 | /// modeling, to work. |
1153 | MVT SITargetLowering::getPointerTy(const DataLayout &DL, unsigned AS) const { |
1154 | if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160) |
1155 | return MVT::v5i32; |
1156 | if (AMDGPUAS::BUFFER_STRIDED_POINTER == AS && |
1157 | DL.getPointerSizeInBits(AS) == 192) |
1158 | return MVT::v6i32; |
1159 | return AMDGPUTargetLowering::getPointerTy(DL, AS); |
1160 | } |
1161 | /// Similarly, the in-memory representation of a p7 is {p8, i32}, aka |
1162 | /// v8i32 when padding is added. |
1163 | /// The in-memory representation of a p9 is {p8, i32, i32}, which is |
1164 | /// also v8i32 with padding. |
1165 | MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const { |
1166 | if ((AMDGPUAS::BUFFER_FAT_POINTER == AS && |
1167 | DL.getPointerSizeInBits(AS) == 160) || |
1168 | (AMDGPUAS::BUFFER_STRIDED_POINTER == AS && |
1169 | DL.getPointerSizeInBits(AS) == 192)) |
1170 | return MVT::v8i32; |
1171 | return AMDGPUTargetLowering::getPointerMemTy(DL, AS); |
1172 | } |
1173 | |
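// Describe the memory access performed by a target memory intrinsic so that
// selection can attach a MachineMemOperand with the proper type, pointer
// value and flags.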
1174 | bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, |
1175 | const CallInst &CI, |
1176 | MachineFunction &MF, |
1177 | unsigned IntrID) const { |
1178 | Info.flags = MachineMemOperand::MONone; |
1179 | if (CI.hasMetadata(KindID: LLVMContext::MD_invariant_load)) |
1180 | Info.flags |= MachineMemOperand::MOInvariant; |
1181 | |
1182 | if (const AMDGPU::RsrcIntrinsic *RsrcIntr = |
1183 | AMDGPU::lookupRsrcIntrinsic(Intr: IntrID)) { |
1184 | AttributeList Attr = Intrinsic::getAttributes(C&: CI.getContext(), |
1185 | id: (Intrinsic::ID)IntrID); |
1186 | MemoryEffects ME = Attr.getMemoryEffects(); |
1187 | if (ME.doesNotAccessMemory()) |
1188 | return false; |
1189 | |
1190 | // TODO: Should images get their own address space? |
1191 | Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE; |
1192 | |
1193 | const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr; |
1194 | if (RsrcIntr->IsImage) { |
1195 | const AMDGPU::ImageDimIntrinsicInfo *Intr = |
1196 | AMDGPU::getImageDimIntrinsicInfo(Intr: IntrID); |
1197 | BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Intr->BaseOpcode); |
1198 | Info.align.reset(); |
1199 | } |
1200 | |
1201 | Value *RsrcArg = CI.getArgOperand(i: RsrcIntr->RsrcArg); |
1202 | if (auto *RsrcPtrTy = dyn_cast<PointerType>(Val: RsrcArg->getType())) { |
1203 | if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE) |
1204 | // We conservatively set the memory operand of a buffer intrinsic to the |
1205 | // base resource pointer, so that we can access alias information about |
1206 | // those pointers. Cases like "this points at the same value |
1207 | // but with a different offset" are handled in |
1208 | // areMemAccessesTriviallyDisjoint. |
1209 | Info.ptrVal = RsrcArg; |
1210 | } |
1211 | |
1212 | auto *Aux = cast<ConstantInt>(Val: CI.getArgOperand(i: CI.arg_size() - 1)); |
1213 | if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE) |
1214 | Info.flags |= MachineMemOperand::MOVolatile; |
1215 | Info.flags |= MachineMemOperand::MODereferenceable; |
1216 | if (ME.onlyReadsMemory()) { |
1217 | if (RsrcIntr->IsImage) { |
1218 | unsigned MaxNumLanes = 4; |
1219 | |
1220 | if (!BaseOpcode->Gather4) { |
1221 | // If this isn't a gather, we may have excess loaded elements in the |
1222 | // IR type. Check the dmask for the real number of elements loaded. |
1223 | unsigned DMask |
1224 | = cast<ConstantInt>(Val: CI.getArgOperand(i: 0))->getZExtValue(); |
1225 | MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(Value: DMask); |
1226 | } |
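// E.g. a dmask of 0b1011 enables three components, so an IR return type of
// <4 x float> is narrowed to a v3f32 memory VT below.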
1227 | |
1228 | Info.memVT = memVTFromLoadIntrReturn(TLI: *this, DL: MF.getDataLayout(), |
1229 | Ty: CI.getType(), MaxNumLanes); |
1230 | } else { |
1231 | Info.memVT = |
1232 | memVTFromLoadIntrReturn(TLI: *this, DL: MF.getDataLayout(), Ty: CI.getType(), |
1233 | MaxNumLanes: std::numeric_limits<unsigned>::max()); |
1234 | } |
1235 | |
1236 | // FIXME: What does alignment mean for an image? |
1237 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
1238 | Info.flags |= MachineMemOperand::MOLoad; |
1239 | } else if (ME.onlyWritesMemory()) { |
1240 | Info.opc = ISD::INTRINSIC_VOID; |
1241 | |
1242 | Type *DataTy = CI.getArgOperand(i: 0)->getType(); |
1243 | if (RsrcIntr->IsImage) { |
1244 | unsigned DMask = cast<ConstantInt>(Val: CI.getArgOperand(i: 1))->getZExtValue(); |
1245 | unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(Value: DMask); |
1246 | Info.memVT = memVTFromLoadIntrData(TLI: *this, DL: MF.getDataLayout(), Ty: DataTy, |
1247 | MaxNumLanes: DMaskLanes); |
1248 | } else |
1249 | Info.memVT = getValueType(DL: MF.getDataLayout(), Ty: DataTy); |
1250 | |
1251 | Info.flags |= MachineMemOperand::MOStore; |
1252 | } else { |
1253 | // Atomic or NoReturn Sampler |
1254 | Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID : |
1255 | ISD::INTRINSIC_W_CHAIN; |
1256 | Info.flags |= MachineMemOperand::MOLoad | |
1257 | MachineMemOperand::MOStore | |
1258 | MachineMemOperand::MODereferenceable; |
1259 | |
1260 | switch (IntrID) { |
1261 | default: |
1262 | if (RsrcIntr->IsImage && BaseOpcode->NoReturn) { |
1263 | // Fake memory access type for no return sampler intrinsics |
1264 | Info.memVT = MVT::i32; |
1265 | } else { |
1266 | // XXX - Should this be volatile without known ordering? |
1267 | Info.flags |= MachineMemOperand::MOVolatile; |
1268 | Info.memVT = MVT::getVT(Ty: CI.getArgOperand(i: 0)->getType()); |
1269 | } |
1270 | break; |
1271 | case Intrinsic::amdgcn_raw_buffer_load_lds: |
1272 | case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: |
1273 | case Intrinsic::amdgcn_struct_buffer_load_lds: |
1274 | case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: { |
1275 | unsigned Width = cast<ConstantInt>(Val: CI.getArgOperand(i: 2))->getZExtValue(); |
1276 | Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: Width * 8); |
1277 | Info.ptrVal = CI.getArgOperand(i: 1); |
1278 | return true; |
1279 | } |
1280 | case Intrinsic::amdgcn_raw_atomic_buffer_load: |
1281 | case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load: { |
1282 | Info.memVT = |
1283 | memVTFromLoadIntrReturn(TLI: *this, DL: MF.getDataLayout(), Ty: CI.getType(), |
1284 | MaxNumLanes: std::numeric_limits<unsigned>::max()); |
1285 | Info.flags &= ~MachineMemOperand::MOStore; |
1286 | return true; |
1287 | } |
1288 | } |
1289 | } |
1290 | return true; |
1291 | } |
1292 | |
1293 | switch (IntrID) { |
1294 | case Intrinsic::amdgcn_ds_ordered_add: |
1295 | case Intrinsic::amdgcn_ds_ordered_swap: { |
1296 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
1297 | Info.memVT = MVT::getVT(Ty: CI.getType()); |
1298 | Info.ptrVal = CI.getOperand(i_nocapture: 0); |
1299 | Info.align.reset(); |
1300 | Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; |
1301 | |
1302 | const ConstantInt *Vol = cast<ConstantInt>(Val: CI.getOperand(i_nocapture: 4)); |
1303 | if (!Vol->isZero()) |
1304 | Info.flags |= MachineMemOperand::MOVolatile; |
1305 | |
1306 | return true; |
1307 | } |
1308 | case Intrinsic::amdgcn_ds_add_gs_reg_rtn: |
1309 | case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: { |
1310 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
1311 | Info.memVT = MVT::getVT(Ty: CI.getOperand(i_nocapture: 0)->getType()); |
1312 | Info.ptrVal = nullptr; |
1313 | Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER; |
1314 | Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; |
1315 | return true; |
1316 | } |
1317 | case Intrinsic::amdgcn_ds_append: |
1318 | case Intrinsic::amdgcn_ds_consume: { |
1319 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
1320 | Info.memVT = MVT::getVT(Ty: CI.getType()); |
1321 | Info.ptrVal = CI.getOperand(i_nocapture: 0); |
1322 | Info.align.reset(); |
1323 | Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; |
1324 | |
1325 | const ConstantInt *Vol = cast<ConstantInt>(Val: CI.getOperand(i_nocapture: 1)); |
1326 | if (!Vol->isZero()) |
1327 | Info.flags |= MachineMemOperand::MOVolatile; |
1328 | |
1329 | return true; |
1330 | } |
1331 | case Intrinsic::amdgcn_global_atomic_csub: { |
1332 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
1333 | Info.memVT = MVT::getVT(Ty: CI.getType()); |
1334 | Info.ptrVal = CI.getOperand(i_nocapture: 0); |
1335 | Info.align.reset(); |
1336 | Info.flags |= MachineMemOperand::MOLoad | |
1337 | MachineMemOperand::MOStore | |
1338 | MachineMemOperand::MOVolatile; |
1339 | return true; |
1340 | } |
1341 | case Intrinsic::amdgcn_image_bvh_intersect_ray: { |
1342 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
1343 | Info.memVT = MVT::getVT(Ty: CI.getType()); // XXX: what is correct VT? |
1344 | |
1345 | Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE; |
1346 | Info.align.reset(); |
1347 | Info.flags |= MachineMemOperand::MOLoad | |
1348 | MachineMemOperand::MODereferenceable; |
1349 | return true; |
1350 | } |
1351 | case Intrinsic::amdgcn_global_atomic_fadd: |
1352 | case Intrinsic::amdgcn_global_atomic_fmin: |
1353 | case Intrinsic::amdgcn_global_atomic_fmax: |
1354 | case Intrinsic::amdgcn_global_atomic_fmin_num: |
1355 | case Intrinsic::amdgcn_global_atomic_fmax_num: |
1356 | case Intrinsic::amdgcn_global_atomic_ordered_add_b64: |
1357 | case Intrinsic::amdgcn_flat_atomic_fadd: |
1358 | case Intrinsic::amdgcn_flat_atomic_fmin: |
1359 | case Intrinsic::amdgcn_flat_atomic_fmax: |
1360 | case Intrinsic::amdgcn_flat_atomic_fmin_num: |
1361 | case Intrinsic::amdgcn_flat_atomic_fmax_num: |
1362 | case Intrinsic::amdgcn_global_atomic_fadd_v2bf16: |
1363 | case Intrinsic::amdgcn_atomic_cond_sub_u32: |
1364 | case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: { |
1365 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
1366 | Info.memVT = MVT::getVT(Ty: CI.getType()); |
1367 | Info.ptrVal = CI.getOperand(i_nocapture: 0); |
1368 | Info.align.reset(); |
1369 | Info.flags |= MachineMemOperand::MOLoad | |
1370 | MachineMemOperand::MOStore | |
1371 | MachineMemOperand::MODereferenceable | |
1372 | MachineMemOperand::MOVolatile; |
1373 | return true; |
1374 | } |
1375 | case Intrinsic::amdgcn_global_load_tr_b64: |
1376 | case Intrinsic::amdgcn_global_load_tr_b128: { |
1377 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
1378 | Info.memVT = MVT::getVT(Ty: CI.getType()); |
1379 | Info.ptrVal = CI.getOperand(i_nocapture: 0); |
1380 | Info.align.reset(); |
1381 | Info.flags |= MachineMemOperand::MOLoad; |
1382 | return true; |
1383 | } |
1384 | case Intrinsic::amdgcn_ds_gws_init: |
1385 | case Intrinsic::amdgcn_ds_gws_barrier: |
1386 | case Intrinsic::amdgcn_ds_gws_sema_v: |
1387 | case Intrinsic::amdgcn_ds_gws_sema_br: |
1388 | case Intrinsic::amdgcn_ds_gws_sema_p: |
1389 | case Intrinsic::amdgcn_ds_gws_sema_release_all: { |
1390 | Info.opc = ISD::INTRINSIC_VOID; |
1391 | |
1392 | const GCNTargetMachine &TM = |
1393 | static_cast<const GCNTargetMachine &>(getTargetMachine()); |
1394 | |
1395 | SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
1396 | Info.ptrVal = MFI->getGWSPSV(TM); |
1397 | |
1398 | // This is an abstract access, but we need to specify a type and size. |
1399 | Info.memVT = MVT::i32; |
1400 | Info.size = 4; |
1401 | Info.align = Align(4); |
1402 | |
1403 | if (IntrID == Intrinsic::amdgcn_ds_gws_barrier) |
1404 | Info.flags |= MachineMemOperand::MOLoad; |
1405 | else |
1406 | Info.flags |= MachineMemOperand::MOStore; |
1407 | return true; |
1408 | } |
1409 | case Intrinsic::amdgcn_global_load_lds: { |
1410 | Info.opc = ISD::INTRINSIC_VOID; |
1411 | unsigned Width = cast<ConstantInt>(Val: CI.getArgOperand(i: 2))->getZExtValue(); |
1412 | Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: Width * 8); |
1413 | Info.ptrVal = CI.getArgOperand(i: 1); |
1414 | Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; |
1415 | return true; |
1416 | } |
1417 | case Intrinsic::amdgcn_ds_bvh_stack_rtn: { |
1418 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
1419 | |
1420 | const GCNTargetMachine &TM = |
1421 | static_cast<const GCNTargetMachine &>(getTargetMachine()); |
1422 | |
1423 | SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
1424 | Info.ptrVal = MFI->getGWSPSV(TM); |
1425 | |
1426 | // This is an abstract access, but we need to specify a type and size. |
1427 | Info.memVT = MVT::i32; |
1428 | Info.size = 4; |
1429 | Info.align = Align(4); |
1430 | |
1431 | Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; |
1432 | return true; |
1433 | } |
1434 | default: |
1435 | return false; |
1436 | } |
1437 | } |
1438 | |
1439 | void SITargetLowering::CollectTargetIntrinsicOperands( |
1440 | const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const { |
1441 | switch (cast<IntrinsicInst>(Val: I).getIntrinsicID()) { |
1442 | case Intrinsic::amdgcn_addrspacecast_nonnull: { |
1443 | // The DAG's ValueType loses the addrspaces. |
1444 | // Add them as 2 extra Constant operands "from" and "to". |
1445 | unsigned SrcAS = I.getOperand(i_nocapture: 0)->getType()->getPointerAddressSpace(); |
1446 | unsigned DstAS = I.getType()->getPointerAddressSpace(); |
1447 | Ops.push_back(Elt: DAG.getTargetConstant(Val: SrcAS, DL: SDLoc(), VT: MVT::i32)); |
1448 | Ops.push_back(Elt: DAG.getTargetConstant(Val: DstAS, DL: SDLoc(), VT: MVT::i32)); |
1449 | break; |
1450 | } |
1451 | default: |
1452 | break; |
1453 | } |
1454 | } |
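// For example, an @llvm.amdgcn.addrspacecast.nonnull call from addrspace(3) to
// addrspace(0) gets two extra i32 target constants (3 and 0) appended so the
// source and destination address spaces survive into the selected node.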
1455 | |
1456 | bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II, |
1457 | SmallVectorImpl<Value*> &Ops, |
1458 | Type *&AccessTy) const { |
1459 | Value *Ptr = nullptr; |
1460 | switch (II->getIntrinsicID()) { |
1461 | case Intrinsic::amdgcn_atomic_cond_sub_u32: |
1462 | case Intrinsic::amdgcn_ds_append: |
1463 | case Intrinsic::amdgcn_ds_consume: |
1464 | case Intrinsic::amdgcn_ds_ordered_add: |
1465 | case Intrinsic::amdgcn_ds_ordered_swap: |
1466 | case Intrinsic::amdgcn_flat_atomic_fadd: |
1467 | case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: |
1468 | case Intrinsic::amdgcn_flat_atomic_fmax: |
1469 | case Intrinsic::amdgcn_flat_atomic_fmax_num: |
1470 | case Intrinsic::amdgcn_flat_atomic_fmin: |
1471 | case Intrinsic::amdgcn_flat_atomic_fmin_num: |
1472 | case Intrinsic::amdgcn_global_atomic_csub: |
1473 | case Intrinsic::amdgcn_global_atomic_fadd: |
1474 | case Intrinsic::amdgcn_global_atomic_fadd_v2bf16: |
1475 | case Intrinsic::amdgcn_global_atomic_fmax: |
1476 | case Intrinsic::amdgcn_global_atomic_fmax_num: |
1477 | case Intrinsic::amdgcn_global_atomic_fmin: |
1478 | case Intrinsic::amdgcn_global_atomic_fmin_num: |
1479 | case Intrinsic::amdgcn_global_atomic_ordered_add_b64: |
1480 | case Intrinsic::amdgcn_global_load_tr_b64: |
1481 | case Intrinsic::amdgcn_global_load_tr_b128: |
1482 | Ptr = II->getArgOperand(i: 0); |
1483 | break; |
1484 | case Intrinsic::amdgcn_global_load_lds: |
1485 | Ptr = II->getArgOperand(i: 1); |
1486 | break; |
1487 | default: |
1488 | return false; |
1489 | } |
1490 | AccessTy = II->getType(); |
1491 | Ops.push_back(Elt: Ptr); |
1492 | return true; |
1493 | } |
1494 | |
1495 | bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM, |
1496 | unsigned AddrSpace) const { |
1497 | if (!Subtarget->hasFlatInstOffsets()) { |
1498 | // Flat instructions do not have offsets, and only have the register |
1499 | // address. |
1500 | return AM.BaseOffs == 0 && AM.Scale == 0; |
1501 | } |
1502 | |
1503 | decltype(SIInstrFlags::FLAT) FlatVariant = |
1504 | AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ? SIInstrFlags::FlatGlobal |
1505 | : AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ? SIInstrFlags::FlatScratch |
1506 | : SIInstrFlags::FLAT; |
1507 | |
1508 | return AM.Scale == 0 && |
1509 | (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset( |
1510 | Offset: AM.BaseOffs, AddrSpace, FlatVariant)); |
1511 | } |
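// Example: with flat instruction offsets available, "base + 1024" is legal for
// the global address space only if isLegalFLATOffset accepts a 1024-byte
// immediate for the FlatGlobal encoding; any scaled form such as "base + 4*r"
// is always rejected because AM.Scale must be 0.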
1512 | |
1513 | bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const { |
1514 | if (Subtarget->hasFlatGlobalInsts()) |
1515 | return isLegalFlatAddressingMode(AM, AddrSpace: AMDGPUAS::GLOBAL_ADDRESS); |
1516 | |
1517 | if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) { |
1518 | // Assume that we will use FLAT for all global memory accesses |
1519 | // on VI. |
1520 | // FIXME: This assumption is currently wrong. On VI we still use |
1521 | // MUBUF instructions for the r + i addressing mode. As currently |
1522 | // implemented, the MUBUF instructions only work on buffer < 4GB. |
1523 | // It may be possible to support > 4GB buffers with MUBUF instructions, |
1524 | // by setting the stride value in the resource descriptor which would |
1525 | // increase the size limit to (stride * 4GB). However, this is risky, |
1526 | // because it has never been validated. |
1527 | return isLegalFlatAddressingMode(AM, AddrSpace: AMDGPUAS::FLAT_ADDRESS); |
1528 | } |
1529 | |
1530 | return isLegalMUBUFAddressingMode(AM); |
1531 | } |
1532 | |
1533 | bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const { |
1534 | // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and |
1535 | // additionally can do r + r + i with addr64. 32-bit has more addressing |
1536 | // mode options. Depending on the resource constant, it can also do |
1537 | // (i64 r0) + (i32 r1) * (i14 i). |
1538 | // |
1539 | // Private arrays end up using a scratch buffer most of the time, so also |
1540 | // assume those use MUBUF instructions. Scratch loads / stores are currently |
1541 | // implemented as mubuf instructions with the offen bit set, so they are |
1542 | // slightly different from the normal addr64. |
1543 | const SIInstrInfo *TII = Subtarget->getInstrInfo(); |
1544 | if (!TII->isLegalMUBUFImmOffset(Imm: AM.BaseOffs)) |
1545 | return false; |
1546 | |
1547 | // FIXME: Since we can split immediate into soffset and immediate offset, |
1548 | // would it make sense to allow any immediate? |
1549 | |
1550 | switch (AM.Scale) { |
1551 | case 0: // r + i or just i, depending on HasBaseReg. |
1552 | return true; |
1553 | case 1: |
1554 | return true; // We have r + r or r + i. |
1555 | case 2: |
1556 | if (AM.HasBaseReg) { |
1557 | // Reject 2 * r + r. |
1558 | return false; |
1559 | } |
1560 | |
1561 | // Allow 2 * r as r + r |
1562 | // Or 2 * r + i is allowed as r + r + i. |
1563 | return true; |
1564 | default: // Don't allow n * r |
1565 | return false; |
1566 | } |
1567 | } |
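// Example: "2*r" and "2*r + i" are accepted above because they can be folded
// into "r + r" and "r + r + i", while "2*r + r" (Scale == 2 with a base
// register) is rejected.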
1568 | |
1569 | bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, |
1570 | const AddrMode &AM, Type *Ty, |
1571 | unsigned AS, Instruction *I) const { |
1572 | // No global is ever allowed as a base. |
1573 | if (AM.BaseGV) |
1574 | return false; |
1575 | |
1576 | if (AS == AMDGPUAS::GLOBAL_ADDRESS) |
1577 | return isLegalGlobalAddressingMode(AM); |
1578 | |
1579 | if (AS == AMDGPUAS::CONSTANT_ADDRESS || |
1580 | AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT || |
1581 | AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE || |
1582 | AS == AMDGPUAS::BUFFER_STRIDED_POINTER) { |
1583 | // If the offset isn't a multiple of 4, it probably isn't going to be |
1584 | // correctly aligned. |
1585 | // FIXME: Can we get the real alignment here? |
1586 | if (AM.BaseOffs % 4 != 0) |
1587 | return isLegalMUBUFAddressingMode(AM); |
1588 | |
1589 | if (!Subtarget->hasScalarSubwordLoads()) { |
1590 | // There are no SMRD extloads, so if we have to do a small type access we |
1591 | // will use a MUBUF load. |
1592 | // FIXME?: We also need to do this if unaligned, but we don't know the |
1593 | // alignment here. |
1594 | if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4) |
1595 | return isLegalGlobalAddressingMode(AM); |
1596 | } |
1597 | |
1598 | if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) { |
1599 | // SMRD instructions have an 8-bit, dword offset on SI. |
1600 | if (!isUInt<8>(x: AM.BaseOffs / 4)) |
1601 | return false; |
1602 | } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) { |
1603 | // On CI+, this can also be a 32-bit literal constant offset. If it fits |
1604 | // in 8-bits, it can use a smaller encoding. |
1605 | if (!isUInt<32>(x: AM.BaseOffs / 4)) |
1606 | return false; |
1607 | } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) { |
1608 | // On VI, these use the SMEM format and the offset is 20-bit in bytes. |
1609 | if (!isUInt<20>(x: AM.BaseOffs)) |
1610 | return false; |
1611 | } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) { |
1612 | // On GFX9 the offset is signed 21-bit in bytes (but must not be negative |
1613 | // for S_BUFFER_* instructions). |
1614 | if (!isInt<21>(x: AM.BaseOffs)) |
1615 | return false; |
1616 | } else { |
1617 | // On GFX12, all offsets are signed 24-bit in bytes. |
1618 | if (!isInt<24>(x: AM.BaseOffs)) |
1619 | return false; |
1620 | } |
1621 | |
1622 | if ((AS == AMDGPUAS::CONSTANT_ADDRESS || |
1623 | AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) && |
1624 | AM.BaseOffs < 0) { |
1625 | // Scalar (non-buffer) loads can only use a negative offset if |
1626 | // soffset+offset is non-negative. Since the compiler can only prove that |
1627 | // in a few special cases, it is safer to claim that negative offsets are |
1628 | // not supported. |
1629 | return false; |
1630 | } |
1631 | |
1632 | if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg. |
1633 | return true; |
1634 | |
1635 | if (AM.Scale == 1 && AM.HasBaseReg) |
1636 | return true; |
1637 | |
1638 | return false; |
1639 | } |
1640 | |
1641 | if (AS == AMDGPUAS::PRIVATE_ADDRESS) |
1642 | return Subtarget->enableFlatScratch() |
1643 | ? isLegalFlatAddressingMode(AM, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS) |
1644 | : isLegalMUBUFAddressingMode(AM); |
1645 | |
1646 | if (AS == AMDGPUAS::LOCAL_ADDRESS || |
1647 | (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) { |
1648 | // Basic, single offset DS instructions allow a 16-bit unsigned immediate |
1649 | // field. |
1650 | // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have |
1651 | // an 8-bit dword offset but we don't know the alignment here. |
1652 | if (!isUInt<16>(x: AM.BaseOffs)) |
1653 | return false; |
1654 | |
1655 | if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg. |
1656 | return true; |
1657 | |
1658 | if (AM.Scale == 1 && AM.HasBaseReg) |
1659 | return true; |
1660 | |
1661 | return false; |
1662 | } |
1663 | |
1664 | if (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) { |
1665 | // For an unknown address space, this usually means that this is for some |
1666 | // reason being used for pure arithmetic, and not based on some addressing |
1667 | // computation. We don't have instructions that compute pointers with any |
1668 | // addressing modes, so treat them as having no offset like flat |
1669 | // instructions. |
1670 | return isLegalFlatAddressingMode(AM, AddrSpace: AMDGPUAS::FLAT_ADDRESS); |
1671 | } |
1672 | |
1673 | // Assume a user alias of global for unknown address spaces. |
1674 | return isLegalGlobalAddressingMode(AM); |
1675 | } |
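// Worked example for the scalar-offset checks above: a base offset of 1020
// bytes is 255 dwords and fits the 8-bit dword offset on SI, while 1024 bytes
// (256 dwords) does not, so that addressing mode is only legal on CI and later
// generations with their wider offset encodings.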
1676 | |
1677 | bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT, |
1678 | const MachineFunction &MF) const { |
1679 | if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) |
1680 | return (MemVT.getSizeInBits() <= 4 * 32); |
1681 | if (AS == AMDGPUAS::PRIVATE_ADDRESS) { |
1682 | unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize(); |
1683 | return (MemVT.getSizeInBits() <= MaxPrivateBits); |
1684 | } |
1685 | if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) |
1686 | return (MemVT.getSizeInBits() <= 2 * 32); |
1687 | return true; |
1688 | } |
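// In other words, store merging is capped at 128 bits for the flat and global
// address spaces, at 8 * MaxPrivateElementSize bits for private/scratch, and
// at 64 bits for LDS and GDS; other address spaces impose no extra limit here.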
1689 | |
1690 | bool SITargetLowering::allowsMisalignedMemoryAccessesImpl( |
1691 | unsigned Size, unsigned AddrSpace, Align Alignment, |
1692 | MachineMemOperand::Flags Flags, unsigned *IsFast) const { |
1693 | if (IsFast) |
1694 | *IsFast = 0; |
1695 | |
1696 | if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS || |
1697 | AddrSpace == AMDGPUAS::REGION_ADDRESS) { |
1698 | // Check if alignment requirements for ds_read/write instructions are |
1699 | // disabled. |
1700 | if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4)) |
1701 | return false; |
1702 | |
1703 | Align RequiredAlignment(PowerOf2Ceil(A: Size/8)); // Natural alignment. |
1704 | if (Subtarget->hasLDSMisalignedBug() && Size > 32 && |
1705 | Alignment < RequiredAlignment) |
1706 | return false; |
1707 | |
1708 | // Either the alignment requirements are "enabled", or there is an |
1709 | // unaligned LDS access related hardware bug even though alignment requirements |
1710 | // are "disabled". In either case, we need to check for proper alignment |
1711 | // requirements. |
1712 | // |
1713 | switch (Size) { |
1714 | case 64: |
1715 | // SI has a hardware bug in the LDS / GDS bounds checking: if the base |
1716 | // address is negative, then the instruction is incorrectly treated as |
1717 | // out-of-bounds even if base + offsets is in bounds. Split vectorized |
1718 | // loads here to avoid emitting ds_read2_b32. We may re-combine the |
1719 | // load later in the SILoadStoreOptimizer. |
1720 | if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8)) |
1721 | return false; |
1722 | |
1723 | // An 8-byte access via ds_read/write_b64 requires 8-byte alignment, but we |
1724 | // can do a 4-byte aligned, 8-byte access in a single operation using |
1725 | // ds_read2/write2_b32 with adjacent offsets. |
1726 | RequiredAlignment = Align(4); |
1727 | |
1728 | if (Subtarget->hasUnalignedDSAccessEnabled()) { |
1729 | // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/ |
1730 | // ds_write2_b32 depending on the alignment. In either case with either |
1731 | // alignment there is no faster way of doing this. |
1732 | |
1733 | // The numbers returned here and below are not additive; they are a 'speed |
1734 | // rank'. They are just meant to be compared to decide if a certain way |
1735 | // of lowering an operation is faster than another. For that purpose |
1736 | // naturally aligned operation gets it bitsize to indicate that "it |
1737 | // operates with a speed comparable to N-bit wide load". With the full |
1738 | // alignment ds128 is slower than ds96 for example. If underaligned it |
1739 | // is comparable to the speed of a single dword access, which would then |
1740 | // mean 32 < 128 and it is faster to issue a wide load regardless. |
1741 | // 1 is simply "slow, don't do it". I.e. when comparing an aligned load to |
1742 | // a wider load which will no longer be aligned, the latter is slower. |
1743 | if (IsFast) |
1744 | *IsFast = (Alignment >= RequiredAlignment) ? 64 |
1745 | : (Alignment < Align(4)) ? 32 |
1746 | : 1; |
1747 | return true; |
1748 | } |
1749 | |
1750 | break; |
1751 | case 96: |
1752 | if (!Subtarget->hasDS96AndDS128()) |
1753 | return false; |
1754 | |
1755 | // A 12-byte access via ds_read/write_b96 requires 16-byte alignment on |
1756 | // gfx8 and older. |
1757 | |
1758 | if (Subtarget->hasUnalignedDSAccessEnabled()) { |
1759 | // Naturally aligned access is fastest. However, also report it is Fast |
1760 | // if memory is aligned to less than a dword. A narrow load or store will |
1761 | // be as slow as a single ds_read_b96/ds_write_b96, but there will be |
1762 | // more of them, so overall we pay less of a penalty by issuing a single |
1763 | // instruction. |
1764 | |
1765 | // See comment on the values above. |
1766 | if (IsFast) |
1767 | *IsFast = (Alignment >= RequiredAlignment) ? 96 |
1768 | : (Alignment < Align(4)) ? 32 |
1769 | : 1; |
1770 | return true; |
1771 | } |
1772 | |
1773 | break; |
1774 | case 128: |
1775 | if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128()) |
1776 | return false; |
1777 | |
1778 | // A 16-byte access via ds_read/write_b128 requires 16-byte alignment on |
1779 | // gfx8 and older, but we can do an 8-byte aligned, 16-byte access in a |
1780 | // single operation using ds_read2/write2_b64. |
1781 | RequiredAlignment = Align(8); |
1782 | |
1783 | if (Subtarget->hasUnalignedDSAccessEnabled()) { |
1784 | // Naturally aligned access is fastest. However, also report it is Fast |
1785 | // if memory is aligned to less than a dword. A narrow load or store will |
1786 | // be as slow as a single ds_read_b128/ds_write_b128, but there |
1787 | // will be more of them, so overall we pay less of a penalty by issuing a |
1788 | // single instruction. |
1789 | |
1790 | // See comment on the values above. |
1791 | if (IsFast) |
1792 | *IsFast = (Alignment >= RequiredAlignment) ? 128 |
1793 | : (Alignment < Align(4)) ? 32 |
1794 | : 1; |
1795 | return true; |
1796 | } |
1797 | |
1798 | break; |
1799 | default: |
1800 | if (Size > 32) |
1801 | return false; |
1802 | |
1803 | break; |
1804 | } |
1805 | |
1806 | // See comment on the values above. |
1807 | // Note that we have a single-dword or sub-dword access here, so if it is |
1808 | // underaligned it is the slowest possible access, hence the returned value is 0. |
1809 | if (IsFast) |
1810 | *IsFast = (Alignment >= RequiredAlignment) ? Size : 0; |
1811 | |
1812 | return Alignment >= RequiredAlignment || |
1813 | Subtarget->hasUnalignedDSAccessEnabled(); |
1814 | } |
1815 | |
1816 | if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) { |
1817 | bool AlignedBy4 = Alignment >= Align(4); |
1818 | if (IsFast) |
1819 | *IsFast = AlignedBy4; |
1820 | |
1821 | return AlignedBy4 || |
1822 | Subtarget->enableFlatScratch() || |
1823 | Subtarget->hasUnalignedScratchAccess(); |
1824 | } |
1825 | |
1826 | // FIXME: We have to be conservative here and assume that flat operations |
1827 | // will access scratch. If we had access to the IR function, then we |
1828 | // could determine if any private memory was used in the function. |
1829 | if (AddrSpace == AMDGPUAS::FLAT_ADDRESS && |
1830 | !Subtarget->hasUnalignedScratchAccess()) { |
1831 | bool AlignedBy4 = Alignment >= Align(4); |
1832 | if (IsFast) |
1833 | *IsFast = AlignedBy4; |
1834 | |
1835 | return AlignedBy4; |
1836 | } |
1837 | |
1838 | // So long as they are correct, wide global memory operations perform better |
1839 | // than multiple smaller memory ops -- even when misaligned |
1840 | if (AMDGPU::isExtendedGlobalAddrSpace(AS: AddrSpace)) { |
1841 | if (IsFast) |
1842 | *IsFast = Size; |
1843 | |
1844 | return Alignment >= Align(4) || |
1845 | Subtarget->hasUnalignedBufferAccessEnabled(); |
1846 | } |
1847 | |
1848 | // Values smaller than a dword must be aligned. |
1849 | if (Size < 32) |
1850 | return false; |
1851 | |
1852 | // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the |
1853 | // byte-address are ignored, thus forcing Dword alignment. |
1854 | // This applies to private, global, and constant memory. |
1855 | if (IsFast) |
1856 | *IsFast = 1; |
1857 | |
1858 | return Size >= 32 && Alignment >= Align(4); |
1859 | } |
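// Example of the "speed rank" values reported above: with unaligned DS access
// enabled, a 128-bit LDS access that is at least 8-byte aligned ranks 128, one
// aligned below 4 bytes ranks 32 (a single wide load still beats four dword
// accesses), and alignments in between rank 1.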
1860 | |
1861 | bool SITargetLowering::allowsMisalignedMemoryAccesses( |
1862 | EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, |
1863 | unsigned *IsFast) const { |
1864 | return allowsMisalignedMemoryAccessesImpl(Size: VT.getSizeInBits(), AddrSpace, |
1865 | Alignment, Flags, IsFast); |
1866 | } |
1867 | |
1868 | EVT SITargetLowering::getOptimalMemOpType( |
1869 | const MemOp &Op, const AttributeList &FuncAttributes) const { |
1870 | // FIXME: Should account for address space here. |
1871 | |
1872 | // The default fallback uses the private pointer size as a guess for a type to |
1873 | // use. Make sure we switch these to 64-bit accesses. |
1874 | |
1875 | if (Op.size() >= 16 && |
1876 | Op.isDstAligned(AlignCheck: Align(4))) // XXX: Should only do for global |
1877 | return MVT::v4i32; |
1878 | |
1879 | if (Op.size() >= 8 && Op.isDstAligned(AlignCheck: Align(4))) |
1880 | return MVT::v2i32; |
1881 | |
1882 | // Use the default. |
1883 | return MVT::Other; |
1884 | } |
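// For instance, a 32-byte memcpy whose destination is known to be 4-byte
// aligned is expanded here with 128-bit v4i32 chunks rather than the default
// pointer-sized type.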
1885 | |
1886 | bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const { |
1887 | const MemSDNode *MemNode = cast<MemSDNode>(Val: N); |
1888 | return MemNode->getMemOperand()->getFlags() & MONoClobber; |
1889 | } |
1890 | |
1891 | bool SITargetLowering::isNonGlobalAddrSpace(unsigned AS) { |
1892 | return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS || |
1893 | AS == AMDGPUAS::PRIVATE_ADDRESS; |
1894 | } |
1895 | |
1896 | bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS, |
1897 | unsigned DestAS) const { |
1898 | // Flat -> private/local is a simple truncate. |
1899 | // Flat -> global is no-op |
1900 | if (SrcAS == AMDGPUAS::FLAT_ADDRESS) |
1901 | return true; |
1902 | |
1903 | const GCNTargetMachine &TM = |
1904 | static_cast<const GCNTargetMachine &>(getTargetMachine()); |
1905 | return TM.isNoopAddrSpaceCast(SrcAS, DestAS); |
1906 | } |
1907 | |
1908 | bool SITargetLowering::isMemOpUniform(const SDNode *N) const { |
1909 | const MemSDNode *MemNode = cast<MemSDNode>(Val: N); |
1910 | |
1911 | return AMDGPUInstrInfo::isUniformMMO(MMO: MemNode->getMemOperand()); |
1912 | } |
1913 | |
1914 | TargetLoweringBase::LegalizeTypeAction |
1915 | SITargetLowering::getPreferredVectorAction(MVT VT) const { |
1916 | if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 && |
1917 | VT.getScalarType().bitsLE(VT: MVT::i16)) |
1918 | return VT.isPow2VectorType() ? TypeSplitVector : TypeWidenVector; |
1919 | return TargetLoweringBase::getPreferredVectorAction(VT); |
1920 | } |
1921 | |
1922 | bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, |
1923 | Type *Ty) const { |
1924 | // FIXME: Could be smarter if called for vector constants. |
1925 | return true; |
1926 | } |
1927 | |
1928 | bool SITargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, |
1929 | unsigned Index) const { |
1930 | if (!isOperationLegalOrCustom(Op: ISD::EXTRACT_SUBVECTOR, VT: ResVT)) |
1931 | return false; |
1932 | |
1933 | // TODO: Add more cases that are cheap. |
1934 | return Index == 0; |
1935 | } |
1936 | |
1937 | bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const { |
1938 | if (Subtarget->has16BitInsts() && VT == MVT::i16) { |
1939 | switch (Op) { |
1940 | case ISD::LOAD: |
1941 | case ISD::STORE: |
1942 | |
1943 | // These operations are done with 32-bit instructions anyway. |
1944 | case ISD::AND: |
1945 | case ISD::OR: |
1946 | case ISD::XOR: |
1947 | case ISD::SELECT: |
1948 | // TODO: Extensions? |
1949 | return true; |
1950 | default: |
1951 | return false; |
1952 | } |
1953 | } |
1954 | |
1955 | // SimplifySetCC uses this function to determine whether or not it should |
1956 | // create setcc with i1 operands. We don't have instructions for i1 setcc. |
1957 | if (VT == MVT::i1 && Op == ISD::SETCC) |
1958 | return false; |
1959 | |
1960 | return TargetLowering::isTypeDesirableForOp(Op, VT); |
1961 | } |
1962 | |
1963 | SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG, |
1964 | const SDLoc &SL, |
1965 | SDValue Chain, |
1966 | uint64_t Offset) const { |
1967 | const DataLayout &DL = DAG.getDataLayout(); |
1968 | MachineFunction &MF = DAG.getMachineFunction(); |
1969 | const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
1970 | |
1971 | const ArgDescriptor *InputPtrReg; |
1972 | const TargetRegisterClass *RC; |
1973 | LLT ArgTy; |
1974 | MVT PtrVT = getPointerTy(DL, AS: AMDGPUAS::CONSTANT_ADDRESS); |
1975 | |
1976 | std::tie(args&: InputPtrReg, args&: RC, args&: ArgTy) = |
1977 | Info->getPreloadedValue(Value: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); |
1978 | |
1979 | // We may not have the kernarg segment argument if we have no kernel |
1980 | // arguments. |
1981 | if (!InputPtrReg) |
1982 | return DAG.getConstant(Val: Offset, DL: SL, VT: PtrVT); |
1983 | |
1984 | MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); |
1985 | SDValue BasePtr = DAG.getCopyFromReg(Chain, dl: SL, |
1986 | Reg: MRI.getLiveInVirtReg(PReg: InputPtrReg->getRegister()), VT: PtrVT); |
1987 | |
1988 | return DAG.getObjectPtrOffset(SL, Ptr: BasePtr, Offset: TypeSize::getFixed(ExactSize: Offset)); |
1989 | } |
1990 | |
1991 | SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG, |
1992 | const SDLoc &SL) const { |
1993 | uint64_t Offset = getImplicitParameterOffset(MF: DAG.getMachineFunction(), |
1994 | Param: FIRST_IMPLICIT); |
1995 | return lowerKernArgParameterPtr(DAG, SL, Chain: DAG.getEntryNode(), Offset); |
1996 | } |
1997 | |
1998 | SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG, |
1999 | const SDLoc &SL) const { |
2000 | |
2001 | Function &F = DAG.getMachineFunction().getFunction(); |
2002 | std::optional<uint32_t> KnownSize = |
2003 | AMDGPUMachineFunction::getLDSKernelIdMetadata(F); |
2004 | if (KnownSize.has_value()) |
2005 | return DAG.getConstant(Val: *KnownSize, DL: SL, VT: MVT::i32); |
2006 | return SDValue(); |
2007 | } |
2008 | |
2009 | SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT, |
2010 | const SDLoc &SL, SDValue Val, |
2011 | bool Signed, |
2012 | const ISD::InputArg *Arg) const { |
2013 | // First, if it is a widened vector, narrow it. |
2014 | if (VT.isVector() && |
2015 | VT.getVectorNumElements() != MemVT.getVectorNumElements()) { |
2016 | EVT NarrowedVT = |
2017 | EVT::getVectorVT(Context&: *DAG.getContext(), VT: MemVT.getVectorElementType(), |
2018 | NumElements: VT.getVectorNumElements()); |
2019 | Val = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: NarrowedVT, N1: Val, |
2020 | N2: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32)); |
2021 | } |
2022 | |
2023 | // Then convert the vector elements or scalar value. |
2024 | if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && |
2025 | VT.bitsLT(VT: MemVT)) { |
2026 | unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext; |
2027 | Val = DAG.getNode(Opcode: Opc, DL: SL, VT: MemVT, N1: Val, N2: DAG.getValueType(VT)); |
2028 | } |
2029 | |
2030 | if (MemVT.isFloatingPoint()) |
2031 | Val = getFPExtOrFPRound(DAG, Op: Val, DL: SL, VT); |
2032 | else if (Signed) |
2033 | Val = DAG.getSExtOrTrunc(Op: Val, DL: SL, VT); |
2034 | else |
2035 | Val = DAG.getZExtOrTrunc(Op: Val, DL: SL, VT); |
2036 | |
2037 | return Val; |
2038 | } |
2039 | |
2040 | SDValue SITargetLowering::lowerKernargMemParameter( |
2041 | SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain, |
2042 | uint64_t Offset, Align Alignment, bool Signed, |
2043 | const ISD::InputArg *Arg) const { |
2044 | MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); |
2045 | |
2046 | // Try to avoid using an extload by loading earlier than the argument address, |
2047 | // and extracting the relevant bits. The load should hopefully be merged with |
2048 | // the previous argument. |
2049 | if (MemVT.getStoreSize() < 4 && Alignment < 4) { |
2050 | // TODO: Handle align < 4 and size >= 4 (can happen with packed structs). |
2051 | int64_t AlignDownOffset = alignDown(Value: Offset, Align: 4); |
2052 | int64_t OffsetDiff = Offset - AlignDownOffset; |
2053 | |
2054 | EVT IntVT = MemVT.changeTypeToInteger(); |
2055 | |
2056 | // TODO: If we passed in the base kernel offset we could have a better |
2057 | // alignment than 4, but we don't really need it. |
2058 | SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset: AlignDownOffset); |
2059 | SDValue Load = DAG.getLoad(VT: MVT::i32, dl: SL, Chain, Ptr, PtrInfo, Alignment: Align(4), |
2060 | MMOFlags: MachineMemOperand::MODereferenceable | |
2061 | MachineMemOperand::MOInvariant); |
2062 | |
2063 | SDValue ShiftAmt = DAG.getConstant(Val: OffsetDiff * 8, DL: SL, VT: MVT::i32); |
2064 | SDValue Extract = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: MVT::i32, N1: Load, N2: ShiftAmt); |
2065 | |
2066 | SDValue ArgVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: IntVT, Operand: Extract); |
2067 | ArgVal = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MemVT, Operand: ArgVal); |
2068 | ArgVal = convertArgType(DAG, VT, MemVT, SL, Val: ArgVal, Signed, Arg); |
2069 | |
2070 | |
2071 | return DAG.getMergeValues(Ops: { ArgVal, Load.getValue(R: 1) }, dl: SL); |
2072 | } |
2073 | |
2074 | SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset); |
2075 | SDValue Load = DAG.getLoad(VT: MemVT, dl: SL, Chain, Ptr, PtrInfo, Alignment, |
2076 | MMOFlags: MachineMemOperand::MODereferenceable | |
2077 | MachineMemOperand::MOInvariant); |
2078 | |
2079 | SDValue Val = convertArgType(DAG, VT, MemVT, SL, Val: Load, Signed, Arg); |
2080 | return DAG.getMergeValues(Ops: { Val, Load.getValue(R: 1) }, dl: SL); |
2081 | } |
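// Worked example for the sub-dword path above: an i16 kernel argument at byte
// offset 6 is loaded as the aligned i32 at offset 4, shifted right by
// (6 - 4) * 8 = 16 bits, truncated to i16, and then converted to the requested
// argument VT.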
2082 | |
2083 | SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA, |
2084 | const SDLoc &SL, SDValue Chain, |
2085 | const ISD::InputArg &Arg) const { |
2086 | MachineFunction &MF = DAG.getMachineFunction(); |
2087 | MachineFrameInfo &MFI = MF.getFrameInfo(); |
2088 | |
2089 | if (Arg.Flags.isByVal()) { |
2090 | unsigned Size = Arg.Flags.getByValSize(); |
2091 | int FrameIdx = MFI.CreateFixedObject(Size, SPOffset: VA.getLocMemOffset(), IsImmutable: false); |
2092 | return DAG.getFrameIndex(FI: FrameIdx, VT: MVT::i32); |
2093 | } |
2094 | |
2095 | unsigned ArgOffset = VA.getLocMemOffset(); |
2096 | unsigned ArgSize = VA.getValVT().getStoreSize(); |
2097 | |
2098 | int FI = MFI.CreateFixedObject(Size: ArgSize, SPOffset: ArgOffset, IsImmutable: true); |
2099 | |
2100 | // Create load nodes to retrieve arguments from the stack. |
2101 | SDValue FIN = DAG.getFrameIndex(FI, VT: MVT::i32); |
2102 | SDValue ArgValue; |
2103 | |
2104 | // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT. |
2105 | ISD::LoadExtType ExtType = ISD::NON_EXTLOAD; |
2106 | MVT MemVT = VA.getValVT(); |
2107 | |
2108 | switch (VA.getLocInfo()) { |
2109 | default: |
2110 | break; |
2111 | case CCValAssign::BCvt: |
2112 | MemVT = VA.getLocVT(); |
2113 | break; |
2114 | case CCValAssign::SExt: |
2115 | ExtType = ISD::SEXTLOAD; |
2116 | break; |
2117 | case CCValAssign::ZExt: |
2118 | ExtType = ISD::ZEXTLOAD; |
2119 | break; |
2120 | case CCValAssign::AExt: |
2121 | ExtType = ISD::EXTLOAD; |
2122 | break; |
2123 | } |
2124 | |
2125 | ArgValue = DAG.getExtLoad( |
2126 | ExtType, dl: SL, VT: VA.getLocVT(), Chain, Ptr: FIN, |
2127 | PtrInfo: MachinePointerInfo::getFixedStack(MF&: DAG.getMachineFunction(), FI), |
2128 | MemVT); |
2129 | return ArgValue; |
2130 | } |
2131 | |
2132 | SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG, |
2133 | const SIMachineFunctionInfo &MFI, |
2134 | EVT VT, |
2135 | AMDGPUFunctionArgInfo::PreloadedValue PVID) const { |
2136 | const ArgDescriptor *Reg = nullptr; |
2137 | const TargetRegisterClass *RC; |
2138 | LLT Ty; |
2139 | |
2140 | CallingConv::ID CC = DAG.getMachineFunction().getFunction().getCallingConv(); |
2141 | const ArgDescriptor WorkGroupIDX = |
2142 | ArgDescriptor::createRegister(Reg: AMDGPU::TTMP9); |
2143 | // If GridZ is not programmed in an entry function then the hardware will set |
2144 | // it to all zeros, so there is no need to mask the GridY value in the low |
2145 | // order bits. |
2146 | const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister( |
2147 | Reg: AMDGPU::TTMP7, |
2148 | Mask: AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu); |
2149 | const ArgDescriptor WorkGroupIDZ = |
2150 | ArgDescriptor::createRegister(Reg: AMDGPU::TTMP7, Mask: 0xFFFF0000u); |
2151 | if (Subtarget->hasArchitectedSGPRs() && |
2152 | (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) { |
2153 | switch (PVID) { |
2154 | case AMDGPUFunctionArgInfo::WORKGROUP_ID_X: |
2155 | Reg = &WorkGroupIDX; |
2156 | RC = &AMDGPU::SReg_32RegClass; |
2157 | Ty = LLT::scalar(SizeInBits: 32); |
2158 | break; |
2159 | case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y: |
2160 | Reg = &WorkGroupIDY; |
2161 | RC = &AMDGPU::SReg_32RegClass; |
2162 | Ty = LLT::scalar(SizeInBits: 32); |
2163 | break; |
2164 | case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z: |
2165 | Reg = &WorkGroupIDZ; |
2166 | RC = &AMDGPU::SReg_32RegClass; |
2167 | Ty = LLT::scalar(SizeInBits: 32); |
2168 | break; |
2169 | default: |
2170 | break; |
2171 | } |
2172 | } |
2173 | |
2174 | if (!Reg) |
2175 | std::tie(args&: Reg, args&: RC, args&: Ty) = MFI.getPreloadedValue(Value: PVID); |
2176 | if (!Reg) { |
2177 | if (PVID == AMDGPUFunctionArgInfo::PreloadedValue::KERNARG_SEGMENT_PTR) { |
2178 | // It's possible for a kernarg intrinsic call to appear in a kernel with |
2179 | // no allocated segment, in which case we do not add the user sgpr |
2180 | // argument, so just return null. |
2181 | return DAG.getConstant(Val: 0, DL: SDLoc(), VT); |
2182 | } |
2183 | |
2184 | // It's undefined behavior if a function marked with the amdgpu-no-* |
2185 | // attributes uses the corresponding intrinsic. |
2186 | return DAG.getUNDEF(VT); |
2187 | } |
2188 | |
2189 | return loadInputValue(DAG, RC, VT, SL: SDLoc(DAG.getEntryNode()), Arg: *Reg); |
2190 | } |
2191 | |
2192 | static void processPSInputArgs(SmallVectorImpl<ISD::InputArg> &Splits, |
2193 | CallingConv::ID CallConv, |
2194 | ArrayRef<ISD::InputArg> Ins, BitVector &Skipped, |
2195 | FunctionType *FType, |
2196 | SIMachineFunctionInfo *Info) { |
2197 | for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) { |
2198 | const ISD::InputArg *Arg = &Ins[I]; |
2199 | |
2200 | assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) && |
2201 | "vector type argument should have been split" ); |
2202 | |
2203 | // First check if it's a PS input addr. |
2204 | if (CallConv == CallingConv::AMDGPU_PS && |
2205 | !Arg->Flags.isInReg() && PSInputNum <= 15) { |
2206 | bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(Index: PSInputNum); |
2207 | |
2208 | // Inconveniently only the first part of the split is marked as isSplit, |
2209 | // so skip to the end. We only want to increment PSInputNum once for the |
2210 | // entire split argument. |
2211 | if (Arg->Flags.isSplit()) { |
2212 | while (!Arg->Flags.isSplitEnd()) { |
2213 | assert((!Arg->VT.isVector() || |
2214 | Arg->VT.getScalarSizeInBits() == 16) && |
2215 | "unexpected vector split in ps argument type" ); |
2216 | if (!SkipArg) |
2217 | Splits.push_back(Elt: *Arg); |
2218 | Arg = &Ins[++I]; |
2219 | } |
2220 | } |
2221 | |
2222 | if (SkipArg) { |
2223 | // We can safely skip PS inputs. |
2224 | Skipped.set(Arg->getOrigArgIndex()); |
2225 | ++PSInputNum; |
2226 | continue; |
2227 | } |
2228 | |
2229 | Info->markPSInputAllocated(Index: PSInputNum); |
2230 | if (Arg->Used) |
2231 | Info->markPSInputEnabled(Index: PSInputNum); |
2232 | |
2233 | ++PSInputNum; |
2234 | } |
2235 | |
2236 | Splits.push_back(Elt: *Arg); |
2237 | } |
2238 | } |
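// For example, an unused PS input whose PSInputNum has not already been
// allocated is skipped entirely (its original argument index is recorded in
// Skipped), while a used input both allocates and enables its PSInputNum.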
2239 | |
2240 | // Allocate special inputs passed in VGPRs. |
2241 | void SITargetLowering::allocateSpecialEntryInputVGPRs(CCState &CCInfo, |
2242 | MachineFunction &MF, |
2243 | const SIRegisterInfo &TRI, |
2244 | SIMachineFunctionInfo &Info) const { |
2245 | const LLT S32 = LLT::scalar(SizeInBits: 32); |
2246 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
2247 | |
2248 | if (Info.hasWorkItemIDX()) { |
2249 | Register Reg = AMDGPU::VGPR0; |
2250 | MRI.setType(VReg: MF.addLiveIn(PReg: Reg, RC: &AMDGPU::VGPR_32RegClass), Ty: S32); |
2251 | |
2252 | CCInfo.AllocateReg(Reg); |
2253 | unsigned Mask = (Subtarget->hasPackedTID() && |
2254 | Info.hasWorkItemIDY()) ? 0x3ff : ~0u; |
2255 | Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask)); |
2256 | } |
2257 | |
2258 | if (Info.hasWorkItemIDY()) { |
2259 | assert(Info.hasWorkItemIDX()); |
2260 | if (Subtarget->hasPackedTID()) { |
2261 | Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg: AMDGPU::VGPR0, |
2262 | Mask: 0x3ff << 10)); |
2263 | } else { |
2264 | unsigned Reg = AMDGPU::VGPR1; |
2265 | MRI.setType(VReg: MF.addLiveIn(PReg: Reg, RC: &AMDGPU::VGPR_32RegClass), Ty: S32); |
2266 | |
2267 | CCInfo.AllocateReg(Reg); |
2268 | Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg)); |
2269 | } |
2270 | } |
2271 | |
2272 | if (Info.hasWorkItemIDZ()) { |
2273 | assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY()); |
2274 | if (Subtarget->hasPackedTID()) { |
2275 | Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg: AMDGPU::VGPR0, |
2276 | Mask: 0x3ff << 20)); |
2277 | } else { |
2278 | unsigned Reg = AMDGPU::VGPR2; |
2279 | MRI.setType(VReg: MF.addLiveIn(PReg: Reg, RC: &AMDGPU::VGPR_32RegClass), Ty: S32); |
2280 | |
2281 | CCInfo.AllocateReg(Reg); |
2282 | Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg)); |
2283 | } |
2284 | } |
2285 | } |
2286 | |
2287 | // Try to allocate a VGPR at the end of the argument list, or, if no argument |
2288 | // VGPRs are left, allocate a stack slot. |
2289 | // If \p Mask is given it indicates the bitfield position in the register. |
2290 | // If \p Arg is given, use it with the new \p Mask instead of allocating a new one. |
2291 | static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u, |
2292 | ArgDescriptor Arg = ArgDescriptor()) { |
2293 | if (Arg.isSet()) |
2294 | return ArgDescriptor::createArg(Arg, Mask); |
2295 | |
2296 | ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32); |
2297 | unsigned RegIdx = CCInfo.getFirstUnallocated(Regs: ArgVGPRs); |
2298 | if (RegIdx == ArgVGPRs.size()) { |
2299 | // Spill to stack required. |
2300 | int64_t Offset = CCInfo.AllocateStack(Size: 4, Alignment: Align(4)); |
2301 | |
2302 | return ArgDescriptor::createStack(Offset, Mask); |
2303 | } |
2304 | |
2305 | unsigned Reg = ArgVGPRs[RegIdx]; |
2306 | Reg = CCInfo.AllocateReg(Reg); |
2307 | assert(Reg != AMDGPU::NoRegister); |
2308 | |
2309 | MachineFunction &MF = CCInfo.getMachineFunction(); |
2310 | Register LiveInVReg = MF.addLiveIn(PReg: Reg, RC: &AMDGPU::VGPR_32RegClass); |
2311 | MF.getRegInfo().setType(VReg: LiveInVReg, Ty: LLT::scalar(SizeInBits: 32)); |
2312 | return ArgDescriptor::createRegister(Reg, Mask); |
2313 | } |
2314 | |
2315 | static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, |
2316 | const TargetRegisterClass *RC, |
2317 | unsigned NumArgRegs) { |
2318 | ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32); |
2319 | unsigned RegIdx = CCInfo.getFirstUnallocated(Regs: ArgSGPRs); |
2320 | if (RegIdx == ArgSGPRs.size()) |
2321 | report_fatal_error(reason: "ran out of SGPRs for arguments" ); |
2322 | |
2323 | unsigned Reg = ArgSGPRs[RegIdx]; |
2324 | Reg = CCInfo.AllocateReg(Reg); |
2325 | assert(Reg != AMDGPU::NoRegister); |
2326 | |
2327 | MachineFunction &MF = CCInfo.getMachineFunction(); |
2328 | MF.addLiveIn(PReg: Reg, RC); |
2329 | return ArgDescriptor::createRegister(Reg); |
2330 | } |
2331 | |
2332 | // If this has a fixed position, we still should allocate the register in the |
2333 | // CCInfo state. Technically we could get away with this for values passed |
2334 | // outside of the normal argument range. |
2335 | static void allocateFixedSGPRInputImpl(CCState &CCInfo, |
2336 | const TargetRegisterClass *RC, |
2337 | MCRegister Reg) { |
2338 | Reg = CCInfo.AllocateReg(Reg); |
2339 | assert(Reg != AMDGPU::NoRegister); |
2340 | MachineFunction &MF = CCInfo.getMachineFunction(); |
2341 | MF.addLiveIn(PReg: Reg, RC); |
2342 | } |
2343 | |
2344 | static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) { |
2345 | if (Arg) { |
2346 | allocateFixedSGPRInputImpl(CCInfo, RC: &AMDGPU::SGPR_32RegClass, |
2347 | Reg: Arg.getRegister()); |
2348 | } else |
2349 | Arg = allocateSGPR32InputImpl(CCInfo, RC: &AMDGPU::SGPR_32RegClass, NumArgRegs: 32); |
2350 | } |
2351 | |
2352 | static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) { |
2353 | if (Arg) { |
2354 | allocateFixedSGPRInputImpl(CCInfo, RC: &AMDGPU::SGPR_64RegClass, |
2355 | Reg: Arg.getRegister()); |
2356 | } else |
2357 | Arg = allocateSGPR32InputImpl(CCInfo, RC: &AMDGPU::SGPR_64RegClass, NumArgRegs: 16); |
2358 | } |
2359 | |
2360 | /// Allocate implicit function VGPR arguments at the end of allocated user |
2361 | /// arguments. |
2362 | void SITargetLowering::allocateSpecialInputVGPRs( |
2363 | CCState &CCInfo, MachineFunction &MF, |
2364 | const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const { |
2365 | const unsigned Mask = 0x3ff; |
2366 | ArgDescriptor Arg; |
2367 | |
2368 | if (Info.hasWorkItemIDX()) { |
2369 | Arg = allocateVGPR32Input(CCInfo, Mask); |
2370 | Info.setWorkItemIDX(Arg); |
2371 | } |
2372 | |
2373 | if (Info.hasWorkItemIDY()) { |
2374 | Arg = allocateVGPR32Input(CCInfo, Mask: Mask << 10, Arg); |
2375 | Info.setWorkItemIDY(Arg); |
2376 | } |
2377 | |
2378 | if (Info.hasWorkItemIDZ()) |
2379 | Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask: Mask << 20, Arg)); |
2380 | } |
2381 | |
2382 | /// Allocate implicit function VGPR arguments in fixed registers. |
2383 | void SITargetLowering::allocateSpecialInputVGPRsFixed( |
2384 | CCState &CCInfo, MachineFunction &MF, |
2385 | const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const { |
2386 | Register Reg = CCInfo.AllocateReg(Reg: AMDGPU::VGPR31); |
2387 | if (!Reg) |
2388 | report_fatal_error(reason: "failed to allocated VGPR for implicit arguments" ); |
2389 | |
2390 | const unsigned Mask = 0x3ff; |
2391 | Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask)); |
2392 | Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask: Mask << 10)); |
2393 | Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask: Mask << 20)); |
2394 | } |
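// With the fixed layout above, all three work-item IDs share VGPR31: X in bits
// [9:0], Y in bits [19:10] and Z in bits [29:20], mirroring the packed-TID
// layout that allocateSpecialEntryInputVGPRs uses in VGPR0.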
2395 | |
2396 | void SITargetLowering::allocateSpecialInputSGPRs( |
2397 | CCState &CCInfo, |
2398 | MachineFunction &MF, |
2399 | const SIRegisterInfo &TRI, |
2400 | SIMachineFunctionInfo &Info) const { |
2401 | auto &ArgInfo = Info.getArgInfo(); |
2402 | const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo(); |
2403 | |
2404 | // TODO: Unify handling with private memory pointers. |
2405 | if (UserSGPRInfo.hasDispatchPtr()) |
2406 | allocateSGPR64Input(CCInfo, Arg&: ArgInfo.DispatchPtr); |
2407 | |
2408 | const Module *M = MF.getFunction().getParent(); |
2409 | if (UserSGPRInfo.hasQueuePtr() && |
2410 | AMDGPU::getAMDHSACodeObjectVersion(M: *M) < AMDGPU::AMDHSA_COV5) |
2411 | allocateSGPR64Input(CCInfo, Arg&: ArgInfo.QueuePtr); |
2412 | |
2413 | // Implicit arg ptr takes the place of the kernarg segment pointer. This is a |
2414 | // constant offset from the kernarg segment. |
2415 | if (Info.hasImplicitArgPtr()) |
2416 | allocateSGPR64Input(CCInfo, Arg&: ArgInfo.ImplicitArgPtr); |
2417 | |
2418 | if (UserSGPRInfo.hasDispatchID()) |
2419 | allocateSGPR64Input(CCInfo, Arg&: ArgInfo.DispatchID); |
2420 | |
2421 | // flat_scratch_init is not applicable for non-kernel functions. |
2422 | |
2423 | if (Info.hasWorkGroupIDX()) |
2424 | allocateSGPR32Input(CCInfo, Arg&: ArgInfo.WorkGroupIDX); |
2425 | |
2426 | if (Info.hasWorkGroupIDY()) |
2427 | allocateSGPR32Input(CCInfo, Arg&: ArgInfo.WorkGroupIDY); |
2428 | |
2429 | if (Info.hasWorkGroupIDZ()) |
2430 | allocateSGPR32Input(CCInfo, Arg&: ArgInfo.WorkGroupIDZ); |
2431 | |
2432 | if (Info.hasLDSKernelId()) |
2433 | allocateSGPR32Input(CCInfo, Arg&: ArgInfo.LDSKernelId); |
2434 | } |
2435 | |
2436 | // Allocate special inputs passed in user SGPRs. |
2437 | void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo, |
2438 | MachineFunction &MF, |
2439 | const SIRegisterInfo &TRI, |
2440 | SIMachineFunctionInfo &Info) const { |
2441 | const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo(); |
2442 | if (UserSGPRInfo.hasImplicitBufferPtr()) { |
2443 | Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI); |
2444 | MF.addLiveIn(PReg: ImplicitBufferPtrReg, RC: &AMDGPU::SGPR_64RegClass); |
2445 | CCInfo.AllocateReg(Reg: ImplicitBufferPtrReg); |
2446 | } |
2447 | |
2448 | // FIXME: How should these inputs interact with inreg / custom SGPR inputs? |
2449 | if (UserSGPRInfo.hasPrivateSegmentBuffer()) { |
2450 | Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI); |
2451 | MF.addLiveIn(PReg: PrivateSegmentBufferReg, RC: &AMDGPU::SGPR_128RegClass); |
2452 | CCInfo.AllocateReg(Reg: PrivateSegmentBufferReg); |
2453 | } |
2454 | |
2455 | if (UserSGPRInfo.hasDispatchPtr()) { |
2456 | Register DispatchPtrReg = Info.addDispatchPtr(TRI); |
2457 | MF.addLiveIn(PReg: DispatchPtrReg, RC: &AMDGPU::SGPR_64RegClass); |
2458 | CCInfo.AllocateReg(Reg: DispatchPtrReg); |
2459 | } |
2460 | |
2461 | const Module *M = MF.getFunction().getParent(); |
2462 | if (UserSGPRInfo.hasQueuePtr() && |
2463 | AMDGPU::getAMDHSACodeObjectVersion(M: *M) < AMDGPU::AMDHSA_COV5) { |
2464 | Register QueuePtrReg = Info.addQueuePtr(TRI); |
2465 | MF.addLiveIn(PReg: QueuePtrReg, RC: &AMDGPU::SGPR_64RegClass); |
2466 | CCInfo.AllocateReg(Reg: QueuePtrReg); |
2467 | } |
2468 | |
2469 | if (UserSGPRInfo.hasKernargSegmentPtr()) { |
2470 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
2471 | Register InputPtrReg = Info.addKernargSegmentPtr(TRI); |
2472 | CCInfo.AllocateReg(Reg: InputPtrReg); |
2473 | |
2474 | Register VReg = MF.addLiveIn(PReg: InputPtrReg, RC: &AMDGPU::SGPR_64RegClass); |
2475 | MRI.setType(VReg, Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64)); |
2476 | } |
2477 | |
2478 | if (UserSGPRInfo.hasDispatchID()) { |
2479 | Register DispatchIDReg = Info.addDispatchID(TRI); |
2480 | MF.addLiveIn(PReg: DispatchIDReg, RC: &AMDGPU::SGPR_64RegClass); |
2481 | CCInfo.AllocateReg(Reg: DispatchIDReg); |
2482 | } |
2483 | |
2484 | if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) { |
2485 | Register FlatScratchInitReg = Info.addFlatScratchInit(TRI); |
2486 | MF.addLiveIn(PReg: FlatScratchInitReg, RC: &AMDGPU::SGPR_64RegClass); |
2487 | CCInfo.AllocateReg(Reg: FlatScratchInitReg); |
2488 | } |
2489 | |
2490 | if (UserSGPRInfo.hasPrivateSegmentSize()) { |
2491 | Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI); |
2492 | MF.addLiveIn(PReg: PrivateSegmentSizeReg, RC: &AMDGPU::SGPR_32RegClass); |
2493 | CCInfo.AllocateReg(Reg: PrivateSegmentSizeReg); |
2494 | } |
2495 | |
2496 | // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read |
2497 | // these from the dispatch pointer. |
2498 | } |
2499 | |
2500 | // Allocate pre-loaded kernel arguments. Arguments to be preloaded must be |
2501 | // sequential starting from the first argument. |
2502 | void SITargetLowering::allocatePreloadKernArgSGPRs( |
2503 | CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs, |
2504 | const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF, |
2505 | const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const { |
2506 | Function &F = MF.getFunction(); |
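// LastExplicitArgOffset tracks the byte offset just past the most recently
// preloaded argument; it starts at the base of the explicit kernel argument
// segment and is used below to compute how many padding SGPRs are needed.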
2507 | unsigned LastExplicitArgOffset = |
2508 | MF.getSubtarget<GCNSubtarget>().getExplicitKernelArgOffset(); |
2509 | GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo(); |
2510 | bool InPreloadSequence = true; |
2511 | unsigned InIdx = 0; |
2512 | for (auto &Arg : F.args()) { |
2513 | if (!InPreloadSequence || !Arg.hasInRegAttr()) |
2514 | break; |
2515 | |
2516 | int ArgIdx = Arg.getArgNo(); |
2517 | // Don't preload non-original args or parts not in the current preload |
2518 | // sequence. |
2519 | if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() || |
2520 | (int)Ins[InIdx].getOrigArgIndex() != ArgIdx)) |
2521 | break; |
2522 | |
2523 | for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() && |
2524 | (int)Ins[InIdx].getOrigArgIndex() == ArgIdx; |
2525 | InIdx++) { |
2526 | assert(ArgLocs[ArgIdx].isMemLoc()); |
2527 | auto &ArgLoc = ArgLocs[InIdx]; |
2528 | const Align KernelArgBaseAlign = Align(16); |
2529 | unsigned ArgOffset = ArgLoc.getLocMemOffset(); |
2530 | Align Alignment = commonAlignment(A: KernelArgBaseAlign, Offset: ArgOffset); |
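// Number of SGPRs needed to hold this argument, rounding its in-register
// size up to a whole number of 32-bit registers.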
2531 | unsigned NumAllocSGPRs = |
2532 | alignTo(Value: ArgLoc.getLocVT().getFixedSizeInBits(), Align: 32) / 32; |
2533 | |
2534 | // Arg is preloaded into the previous SGPR. |
2535 | if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) { |
2536 | Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back( |
2537 | Elt: Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]); |
2538 | continue; |
2539 | } |
2540 | |
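// Any gap between the end of the previous argument and the start of this
// one still occupies user SGPRs; count those padding registers as well.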
2541 | unsigned Padding = ArgOffset - LastExplicitArgOffset; |
2542 | unsigned PaddingSGPRs = alignTo(Value: Padding, Align: 4) / 4; |
2543 | // Check for free user SGPRs for preloading. |
2544 | if (PaddingSGPRs + NumAllocSGPRs + 1 /*Synthetic SGPRs*/ > |
2545 | SGPRInfo.getNumFreeUserSGPRs()) { |
2546 | InPreloadSequence = false; |
2547 | break; |
2548 | } |
2549 | |
2550 | // Preload this argument. |
2551 | const TargetRegisterClass *RC = |
2552 | TRI.getSGPRClassForBitWidth(BitWidth: NumAllocSGPRs * 32); |
2553 | SmallVectorImpl<MCRegister> *PreloadRegs = |
2554 | Info.addPreloadedKernArg(TRI, RC, AllocSizeDWord: NumAllocSGPRs, KernArgIdx: InIdx, PaddingSGPRs); |
2555 | |
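// If the argument was split across several SGPRs, record each component as
// a 32-bit live-in rather than as the wide register tuple.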
2556 | if (PreloadRegs->size() > 1) |
2557 | RC = &AMDGPU::SGPR_32RegClass; |
2558 | for (auto &Reg : *PreloadRegs) { |
2559 | assert(Reg); |
2560 | MF.addLiveIn(PReg: Reg, RC); |
2561 | CCInfo.AllocateReg(Reg); |
2562 | } |
2563 | |
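// Advance past this argument; its end offset becomes the baseline for the
// next padding computation.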
2564 | LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset; |
2565 | } |
2566 | } |
2567 | } |
2568 | |
2569 | void SITargetLowering::allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, |
2570 | const SIRegisterInfo &TRI, |
2571 | SIMachineFunctionInfo &Info) const { |
2572 | // Always allocate this last since it is a synthetic preload. |
2573 | if (Info.hasLDSKernelId()) { |
2574 | Register Reg = Info.addLDSKernelId(); |
2575 | MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass); |
2576 | CCInfo.AllocateReg(Reg); |
2577 | } |
2578 | } |
2579 | |
2580 | // Allocate special input registers that are initialized per-wave. |
2581 | void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, |
2582 | MachineFunction &MF, |
2583 | SIMachineFunctionInfo &Info, |
2584 | CallingConv::ID CallConv, |
2585 | bool IsShader) const { |
2586 | bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs(); |
2587 | if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) { |
2588 | // Note: user SGPRs are handled by the front-end for graphics shaders |
2589 | // Pad up the used user SGPRs with dead inputs. |
2590 | |
2591 | // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately |
2592 | // before enabling architected SGPRs for workgroup IDs. |
2593 | assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget" ); |
2594 | |
2595 | unsigned CurrentUserSGPRs = Info.getNumUserSGPRs(); |
2596 | // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to |
2597 | // rely on it to reach 16 since if we end up having no stack usage, it will |
2598 | // not really be added. |
2599 | unsigned NumRequiredSystemSGPRs = Info.hasWorkGroupIDX() + |
2600 | Info.hasWorkGroupIDY() + |
2601 | Info.hasWorkGroupIDZ() + |
2602 | Info.hasWorkGroupInfo(); |
2603 | for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) { |
2604 | Register Reg = Info.addReservedUserSGPR(); |
2605 | MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass); |
2606 | CCInfo.AllocateReg(Reg); |
2607 | } |
2608 | } |
2609 | |
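// With architected SGPRs the workgroup IDs are provided by the hardware
// rather than through allocated input SGPRs, so nothing is allocated for
// them here.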
2610 | if (!HasArchitectedSGPRs) { |
2611 | if (Info.hasWorkGroupIDX()) { |
2612 | Register Reg = Info.addWorkGroupIDX(); |
2613 | MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass); |
2614 | CCInfo.AllocateReg(Reg); |
2615 | } |
2616 | |
2617 | if (Info.hasWorkGroupIDY()) { |
2618 | Register Reg = Info.addWorkGroupIDY(); |
2619 | MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass); |
2620 | CCInfo.AllocateReg(Reg); |
2621 | } |
2622 | |
2623 | if (Info.hasWorkGroupIDZ()) { |
2624 | Register Reg = Info.addWorkGroupIDZ(); |
2625 | MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass); |
2626 | CCInfo.AllocateReg(Reg); |
2627 | } |
2628 | } |
2629 | |
2630 | if (Info.hasWorkGroupInfo()) { |
2631 | Register Reg = Info.addWorkGroupInfo(); |
2632 | MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass); |
2633 | CCInfo.AllocateReg(Reg); |
2634 | } |
2635 | |
2636 | if (Info.hasPrivateSegmentWaveByteOffset()) { |
2637 | // Scratch wave offset passed in system SGPR. |
2638 | unsigned PrivateSegmentWaveByteOffsetReg; |
2639 | |
2640 | if (IsShader) { |
2641 | PrivateSegmentWaveByteOffsetReg = |
2642 | Info.getPrivateSegmentWaveByteOffsetSystemSGPR(); |
2643 | |
2644 | // This is true if the scratch wave byte offset doesn't have a fixed |
2645 | // location. |
2646 | if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) { |
2647 | PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo); |
2648 | Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg); |
2649 | } |
2650 | } else |
2651 | PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset(); |
2652 | |
2653 | MF.addLiveIn(PReg: PrivateSegmentWaveByteOffsetReg, RC: &AMDGPU::SGPR_32RegClass); |
2654 | CCInfo.AllocateReg(Reg: PrivateSegmentWaveByteOffsetReg); |
2655 | } |
2656 | |
2657 | assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader || |
2658 | Info.getNumPreloadedSGPRs() >= 16); |
2659 | } |
2660 | |
2661 | static void reservePrivateMemoryRegs(const TargetMachine &TM, |
2662 | MachineFunction &MF, |
2663 | const SIRegisterInfo &TRI, |
2664 | SIMachineFunctionInfo &Info) { |
2665 | // Now that we've figured out where the scratch register inputs are, see if
2666 | // we should reserve the arguments and use them directly.
2667 | MachineFrameInfo &MFI = MF.getFrameInfo(); |
2668 | bool HasStackObjects = MFI.hasStackObjects(); |
2669 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
2670 | |
2671 | // Record that we know we have non-spill stack objects so we don't need to |
2672 | // check all stack objects later. |
2673 | if (HasStackObjects) |
2674 | Info.setHasNonSpillStackObjects(true); |
2675 | |
2676 | // Everything live out of a block is spilled with fast regalloc, so it's |
2677 | // almost certain that spilling will be required. |
2678 | if (TM.getOptLevel() == CodeGenOptLevel::None) |
2679 | HasStackObjects = true; |
2680 | |
2681 | // For now assume stack access is needed in any callee functions, so we need |
2682 | // the scratch registers to pass in. |
2683 | bool RequiresStackAccess = HasStackObjects || MFI.hasCalls(); |
2684 | |
2685 | if (!ST.enableFlatScratch()) { |
2686 | if (RequiresStackAccess && ST.isAmdHsaOrMesa(F: MF.getFunction())) { |
2687 | // If we have stack objects, we unquestionably need the private buffer |
2688 | // resource. For the Code Object V2 ABI, this will be the first 4 user |
2689 | // SGPR inputs. We can reserve those and use them directly. |
2690 | |
2691 | Register PrivateSegmentBufferReg = |
2692 | Info.getPreloadedReg(Value: AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); |
2693 | Info.setScratchRSrcReg(PrivateSegmentBufferReg); |
2694 | } else { |
2695 | unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF); |
2696 | // We tentatively reserve the last registers (skipping those which may
2697 | // contain VCC, FLAT_SCR, and XNACK). After register allocation, we'll
2698 | // replace these with the ones immediately after those which were really
2699 | // allocated. In the prologue, copies will be inserted from the argument to
2700 | // these reserved registers.
2701 | |
2702 | // Without HSA, relocations are used for the scratch pointer and the |
2703 | // buffer resource setup is always inserted in the prologue. Scratch wave |
2704 | // offset is still in an input SGPR. |
2705 | Info.setScratchRSrcReg(ReservedBufferReg); |
2706 | } |
2707 | } |
2708 | |
2709 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
2710 | |
2711 | // For entry functions we have to set up the stack pointer if we use it, |
2712 | // whereas non-entry functions get this "for free". This means there is no |
2713 | // intrinsic advantage to using S32 over S34 in cases where we do not have |
2714 | // calls but do need a frame pointer (i.e. if we are requested to have one |
2715 | // because frame pointer elimination is disabled). To keep things simple we |
2716 | // only ever use S32 as the call ABI stack pointer, and so using it does not |
2717 | // imply we need a separate frame pointer. |
2718 | // |
2719 | // Try to use s32 as the SP, but move it if it would interfere with input |
2720 | // arguments. This won't work with calls though. |
2721 | // |
2722 | // FIXME: Move SP to avoid any possible inputs, or find a way to spill input |
2723 | // registers. |
2724 | if (!MRI.isLiveIn(Reg: AMDGPU::SGPR32)) { |
2725 | Info.setStackPtrOffsetReg(AMDGPU::SGPR32); |
2726 | } else { |
2727 | assert(AMDGPU::isShader(MF.getFunction().getCallingConv())); |
2728 | |
2729 | if (MFI.hasCalls()) |
2730 | report_fatal_error(reason: "call in graphics shader with too many input SGPRs" ); |
2731 | |
2732 | for (unsigned Reg : AMDGPU::SGPR_32RegClass) { |
2733 | if (!MRI.isLiveIn(Reg)) { |
2734 | Info.setStackPtrOffsetReg(Reg); |
2735 | break; |
2736 | } |
2737 | } |
2738 | |
2739 | if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG) |
2740 | report_fatal_error(reason: "failed to find register for SP" ); |
2741 | } |
2742 | |
2743 | // hasFP should be accurate for entry functions even before the frame is |
2744 | // finalized, because it does not rely on the known stack size, only |
2745 | // properties like whether variable sized objects are present. |
2746 | if (ST.getFrameLowering()->hasFP(MF)) { |
2747 | Info.setFrameOffsetReg(AMDGPU::SGPR33); |
2748 | } |
2749 | } |
2750 | |
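// Split CSR handling only applies to callable functions; entry functions
// have no callers whose registers need to be preserved.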
2751 | bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const { |
2752 | const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); |
2753 | return !Info->isEntryFunction(); |
2754 | } |
2755 | |
2756 | void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { |
2757 | |
2758 | } |
2759 | |
2760 | void SITargetLowering::insertCopiesSplitCSR( |
2761 | MachineBasicBlock *Entry, |
2762 | const SmallVectorImpl<MachineBasicBlock *> &Exits) const { |
2763 | const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); |
2764 | |
2765 | const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(MF: Entry->getParent()); |
2766 | if (!IStart) |
2767 | return; |
2768 | |
2769 | const TargetInstrInfo *TII = Subtarget->getInstrInfo(); |
2770 | MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); |
2771 | MachineBasicBlock::iterator MBBI = Entry->begin(); |
2772 | for (const MCPhysReg *I = IStart; *I; ++I) { |
2773 | const TargetRegisterClass *RC = nullptr; |
2774 | if (AMDGPU::SReg_64RegClass.contains(Reg: *I)) |
2775 | RC = &AMDGPU::SGPR_64RegClass; |
2776 | else if (AMDGPU::SReg_32RegClass.contains(Reg: *I)) |
2777 | RC = &AMDGPU::SGPR_32RegClass; |
2778 | else |
2779 | llvm_unreachable("Unexpected register class in CSRsViaCopy!" ); |
2780 | |
2781 | Register NewVR = MRI->createVirtualRegister(RegClass: RC); |
2782 | // Create copy from CSR to a virtual register. |
2783 | Entry->addLiveIn(PhysReg: *I); |
2784 | BuildMI(BB&: *Entry, I: MBBI, MIMD: DebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: NewVR) |
2785 | .addReg(RegNo: *I); |
2786 | |
2787 | // Insert the copy-back instructions right before the terminator. |
2788 | for (auto *Exit : Exits) |
2789 | BuildMI(BB&: *Exit, I: Exit->getFirstTerminator(), MIMD: DebugLoc(), |
2790 | MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: *I) |
2791 | .addReg(RegNo: NewVR); |
2792 | } |
2793 | } |
2794 | |
2795 | SDValue SITargetLowering::LowerFormalArguments( |
2796 | SDValue Chain, CallingConv::ID CallConv, bool isVarArg, |
2797 | const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, |
2798 | SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { |
2799 | const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); |
2800 | |
2801 | MachineFunction &MF = DAG.getMachineFunction(); |
2802 | const Function &Fn = MF.getFunction(); |
2803 | FunctionType *FType = MF.getFunction().getFunctionType(); |
2804 | SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
2805 | |
2806 | if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CC: CallConv)) { |
2807 | DiagnosticInfoUnsupported NoGraphicsHSA( |
2808 | Fn, "unsupported non-compute shaders with HSA" , DL.getDebugLoc()); |
2809 | DAG.getContext()->diagnose(DI: NoGraphicsHSA); |
2810 | return DAG.getEntryNode(); |
2811 | } |
2812 | |
2813 | SmallVector<ISD::InputArg, 16> Splits; |
2814 | SmallVector<CCValAssign, 16> ArgLocs; |
2815 | BitVector Skipped(Ins.size()); |
2816 | CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, |
2817 | *DAG.getContext()); |
2818 | |
2819 | bool IsGraphics = AMDGPU::isGraphics(CC: CallConv); |
2820 | bool IsKernel = AMDGPU::isKernel(CC: CallConv); |
2821 | bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC: CallConv); |
2822 | |
2823 | if (IsGraphics) { |
2824 | const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo(); |
2825 | assert(!UserSGPRInfo.hasDispatchPtr() && |
2826 | !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() && |
2827 | !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() && |
2828 | !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ()); |
2829 | (void)UserSGPRInfo; |
2830 | if (!Subtarget->enableFlatScratch()) |
2831 | assert(!UserSGPRInfo.hasFlatScratchInit()); |
2832 | if ((CallConv != CallingConv::AMDGPU_CS && |
2833 | CallConv != CallingConv::AMDGPU_Gfx) || |
2834 | !Subtarget->hasArchitectedSGPRs()) |
2835 | assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() && |
2836 | !Info->hasWorkGroupIDZ()); |
2837 | } |
2838 | |
2839 | if (CallConv == CallingConv::AMDGPU_PS) { |
2840 | processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info); |
2841 | |
2842 | // At least one interpolation mode must be enabled or else the GPU will |
2843 | // hang. |
2844 | // |
2845 | // Check PSInputAddr instead of PSInputEnable. The idea is that if the user |
2846 | // set PSInputAddr, the user wants to enable some bits after the compilation |
2847 | // based on run-time states. Since we can't know what the final PSInputEna |
2848 | // will look like, we shouldn't do anything here; the user should take
2849 | // responsibility for the correct programming.
2850 | // |
2851 | // Otherwise, the following restrictions apply: |
2852 | // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled. |
2853 | // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be |
2854 | // enabled too. |
2855 | if ((Info->getPSInputAddr() & 0x7F) == 0 || |
2856 | ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(Index: 11))) { |
2857 | CCInfo.AllocateReg(Reg: AMDGPU::VGPR0); |
2858 | CCInfo.AllocateReg(Reg: AMDGPU::VGPR1); |
2859 | Info->markPSInputAllocated(Index: 0); |
2860 | Info->markPSInputEnabled(Index: 0); |
2861 | } |
2862 | if (Subtarget->isAmdPalOS()) { |
2863 | // For isAmdPalOS, the user does not enable some bits after compilation |
2864 | // based on run-time states; the register values being generated here are |
2865 | // the final ones set in hardware. Therefore we need to apply the |
2866 | // workaround to PSInputAddr and PSInputEnable together. (The case where |
2867 | // a bit is set in PSInputAddr but not PSInputEnable is where the |
2868 | // frontend set up an input arg for a particular interpolation mode, but |
2869 | // nothing uses that input arg. Really we should have an earlier pass |
2870 | // that removes such an arg.) |
2871 | unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable(); |
2872 | if ((PsInputBits & 0x7F) == 0 || |
2873 | ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1))) |
2874 | Info->markPSInputEnabled(Index: llvm::countr_zero(Val: Info->getPSInputAddr())); |
2875 | } |
2876 | } else if (IsKernel) { |
2877 | assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX()); |
2878 | } else { |
2879 | Splits.append(in_start: Ins.begin(), in_end: Ins.end()); |
2880 | } |
2881 | |
2882 | if (IsKernel) |
2883 | analyzeFormalArgumentsCompute(State&: CCInfo, Ins); |
2884 | |
2885 | if (IsEntryFunc) { |
2886 | allocateSpecialEntryInputVGPRs(CCInfo, MF, TRI: *TRI, Info&: *Info); |
2887 | allocateHSAUserSGPRs(CCInfo, MF, TRI: *TRI, Info&: *Info); |
2888 | if (IsKernel && Subtarget->hasKernargPreload()) |
2889 | allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, TRI: *TRI, Info&: *Info); |
2890 | |
2891 | allocateLDSKernelId(CCInfo, MF, TRI: *TRI, Info&: *Info); |
2892 | } else if (!IsGraphics) { |
2893 | // For the fixed ABI, pass workitem IDs in the last argument register. |
2894 | allocateSpecialInputVGPRsFixed(CCInfo, MF, TRI: *TRI, Info&: *Info); |
2895 | |
2896 | // FIXME: Sink this into allocateSpecialInputSGPRs |
2897 | if (!Subtarget->enableFlatScratch()) |
2898 | CCInfo.AllocateReg(Reg: Info->getScratchRSrcReg()); |
2899 | |
2900 | allocateSpecialInputSGPRs(CCInfo, MF, TRI: *TRI, Info&: *Info); |
2901 | } |
2902 | |
2903 | if (!IsKernel) { |
2904 | CCAssignFn *AssignFn = CCAssignFnForCall(CC: CallConv, IsVarArg: isVarArg); |
2905 | CCInfo.AnalyzeFormalArguments(Ins: Splits, Fn: AssignFn); |
2906 | } |
2907 | |
2908 | SmallVector<SDValue, 16> Chains; |
2909 | |
2910 | // FIXME: This is the minimum kernel argument alignment. We should improve |
2911 | // this to the maximum alignment of the arguments. |
2912 | // |
2913 | // FIXME: Alignment of explicit arguments totally broken with non-0 explicit |
2914 | // kern arg offset. |
2915 | const Align KernelArgBaseAlign = Align(16); |
2916 | |
2917 | for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) { |
2918 | const ISD::InputArg &Arg = Ins[i]; |
2919 | if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) { |
2920 | InVals.push_back(Elt: DAG.getUNDEF(VT: Arg.VT)); |
2921 | continue; |
2922 | } |
2923 | |
2924 | CCValAssign &VA = ArgLocs[ArgIdx++]; |
2925 | MVT VT = VA.getLocVT(); |
2926 | |
2927 | if (IsEntryFunc && VA.isMemLoc()) { |
2928 | VT = Ins[i].VT; |
2929 | EVT MemVT = VA.getLocVT(); |
2930 | |
2931 | const uint64_t Offset = VA.getLocMemOffset(); |
2932 | Align Alignment = commonAlignment(A: KernelArgBaseAlign, Offset); |
2933 | |
2934 | if (Arg.Flags.isByRef()) { |
2935 | SDValue Ptr = lowerKernArgParameterPtr(DAG, SL: DL, Chain, Offset); |
2936 | |
2937 | const GCNTargetMachine &TM = |
2938 | static_cast<const GCNTargetMachine &>(getTargetMachine()); |
2939 | if (!TM.isNoopAddrSpaceCast(SrcAS: AMDGPUAS::CONSTANT_ADDRESS, |
2940 | DestAS: Arg.Flags.getPointerAddrSpace())) { |
2941 | Ptr = DAG.getAddrSpaceCast(dl: DL, VT, Ptr, SrcAS: AMDGPUAS::CONSTANT_ADDRESS, |
2942 | DestAS: Arg.Flags.getPointerAddrSpace()); |
2943 | } |
2944 | |
2945 | InVals.push_back(Elt: Ptr); |
2946 | continue; |
2947 | } |
2948 | |
2949 | SDValue NewArg; |
2950 | if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(Val: i)) { |
2951 | if (MemVT.getStoreSize() < 4 && Alignment < 4) { |
2952 | // In this case the argument is packed into the previous preload SGPR. |
2953 | int64_t AlignDownOffset = alignDown(Value: Offset, Align: 4); |
2954 | int64_t OffsetDiff = Offset - AlignDownOffset; |
2955 | EVT IntVT = MemVT.changeTypeToInteger(); |
2956 | |
2957 | const SIMachineFunctionInfo *Info = |
2958 | MF.getInfo<SIMachineFunctionInfo>(); |
2959 | MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); |
2960 | Register Reg = |
2961 | Info->getArgInfo().PreloadKernArgs.find(Val: i)->getSecond().Regs[0]; |
2962 | |
2963 | assert(Reg); |
2964 | Register VReg = MRI.getLiveInVirtReg(PReg: Reg); |
2965 | SDValue Copy = DAG.getCopyFromReg(Chain, dl: DL, Reg: VReg, VT: MVT::i32); |
2966 | |
2967 | SDValue ShiftAmt = DAG.getConstant(Val: OffsetDiff * 8, DL, VT: MVT::i32); |
2968 | SDValue Extract = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: Copy, N2: ShiftAmt);
2969 | |
2970 | SDValue ArgVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: IntVT, Operand: Extract); |
2971 | ArgVal = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MemVT, Operand: ArgVal); |
2972 | NewArg = convertArgType(DAG, VT, MemVT, SL: DL, Val: ArgVal, |
2973 | Signed: Ins[i].Flags.isSExt(), Arg: &Ins[i]); |
2974 | |
2975 | NewArg = DAG.getMergeValues(Ops: {NewArg, Copy.getValue(R: 1)}, dl: DL); |
2976 | } else { |
2977 | const SIMachineFunctionInfo *Info = |
2978 | MF.getInfo<SIMachineFunctionInfo>(); |
2979 | MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); |
2980 | const SmallVectorImpl<MCRegister> &PreloadRegs = |
2981 | Info->getArgInfo().PreloadKernArgs.find(Val: i)->getSecond().Regs; |
2982 | |
2983 | SDValue Copy; |
2984 | if (PreloadRegs.size() == 1) { |
2985 | Register VReg = MRI.getLiveInVirtReg(PReg: PreloadRegs[0]); |
2986 | const TargetRegisterClass *RC = MRI.getRegClass(Reg: VReg); |
2987 | NewArg = DAG.getCopyFromReg( |
2988 | Chain, dl: DL, Reg: VReg, |
2989 | VT: EVT::getIntegerVT(Context&: *DAG.getContext(), |
2990 | BitWidth: TRI->getRegSizeInBits(RC: *RC))); |
2991 | |
2992 | } else { |
2993 | // If the kernarg alignment does not match the alignment of the SGPR |
2994 | // tuple RC that can accommodate this argument, it will be built up |
2995 | // via copies from the individual SGPRs that the argument was
2996 | // preloaded to. |
2997 | SmallVector<SDValue, 4> Elts; |
2998 | for (auto Reg : PreloadRegs) { |
2999 | Register VReg = MRI.getLiveInVirtReg(PReg: Reg); |
3000 | Copy = DAG.getCopyFromReg(Chain, dl: DL, Reg: VReg, VT: MVT::i32); |
3001 | Elts.push_back(Elt: Copy); |
3002 | } |
3003 | NewArg = |
3004 | DAG.getBuildVector(VT: EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, |
3005 | NumElements: PreloadRegs.size()), |
3006 | DL, Ops: Elts); |
3007 | } |
3008 | |
3009 | // If the argument was preloaded to multiple consecutive 32-bit |
3010 | // registers because of misalignment between addressable SGPR tuples |
3011 | // and the argument size, we can still assume, because of kernarg
3012 | // segment alignment restrictions, that NewArg's size is the same as
3013 | // MemVT and just do a bitcast. If MemVT is less than 32 bits we add a
3014 | // truncate since we cannot preload to less than a single SGPR and the |
3015 | // MemVT may be smaller. |
3016 | EVT MemVTInt = |
3017 | EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: MemVT.getSizeInBits()); |
3018 | if (MemVT.bitsLT(VT: NewArg.getSimpleValueType())) |
3019 | NewArg = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MemVTInt, Operand: NewArg); |
3020 | |
3021 | NewArg = DAG.getBitcast(VT: MemVT, V: NewArg); |
3022 | NewArg = convertArgType(DAG, VT, MemVT, SL: DL, Val: NewArg, |
3023 | Signed: Ins[i].Flags.isSExt(), Arg: &Ins[i]); |
3024 | NewArg = DAG.getMergeValues(Ops: {NewArg, Chain}, dl: DL); |
3025 | } |
3026 | } else { |
3027 | NewArg = |
3028 | lowerKernargMemParameter(DAG, VT, MemVT, SL: DL, Chain, Offset, |
3029 | Alignment, Signed: Ins[i].Flags.isSExt(), Arg: &Ins[i]); |
3030 | } |
3031 | Chains.push_back(Elt: NewArg.getValue(R: 1)); |
3032 | |
3033 | auto *ParamTy = |
3034 | dyn_cast<PointerType>(Val: FType->getParamType(i: Ins[i].getOrigArgIndex())); |
3035 | if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS && |
3036 | ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || |
3037 | ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) { |
3038 | // On SI local pointers are just offsets into LDS, so they are always |
3039 | // less than 16 bits. On CI and newer they could potentially be
3040 | // real pointers, so we can't guarantee their size. |
3041 | NewArg = DAG.getNode(Opcode: ISD::AssertZext, DL, VT: NewArg.getValueType(), N1: NewArg, |
3042 | N2: DAG.getValueType(MVT::i16)); |
3043 | } |
3044 | |
3045 | InVals.push_back(Elt: NewArg); |
3046 | continue; |
3047 | } |
3048 | if (!IsEntryFunc && VA.isMemLoc()) { |
3049 | SDValue Val = lowerStackParameter(DAG, VA, SL: DL, Chain, Arg); |
3050 | InVals.push_back(Elt: Val); |
3051 | if (!Arg.Flags.isByVal()) |
3052 | Chains.push_back(Elt: Val.getValue(R: 1)); |
3053 | continue; |
3054 | } |
3055 | |
3056 | assert(VA.isRegLoc() && "Parameter must be in a register!" ); |
3057 | |
3058 | Register Reg = VA.getLocReg(); |
3059 | const TargetRegisterClass *RC = nullptr; |
3060 | if (AMDGPU::VGPR_32RegClass.contains(Reg)) |
3061 | RC = &AMDGPU::VGPR_32RegClass; |
3062 | else if (AMDGPU::SGPR_32RegClass.contains(Reg)) |
3063 | RC = &AMDGPU::SGPR_32RegClass; |
3064 | else |
3065 | llvm_unreachable("Unexpected register class in LowerFormalArguments!" ); |
3066 | EVT ValVT = VA.getValVT(); |
3067 | |
3068 | Reg = MF.addLiveIn(PReg: Reg, RC); |
3069 | SDValue Val = DAG.getCopyFromReg(Chain, dl: DL, Reg, VT); |
3070 | |
3071 | if (Arg.Flags.isSRet()) { |
3072 | // The return object should be reasonably addressable. |
3073 | |
3074 | // FIXME: This helps when the return is a real sret. If it is an
3075 | // automatically inserted sret (i.e. CanLowerReturn returns false), an |
3076 | // extra copy is inserted in SelectionDAGBuilder which obscures this. |
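// Frame indexes in the scratch address space have known-zero high bits, so
// assert that many leading zeros on the incoming sret pointer.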
3077 | unsigned NumBits |
3078 | = 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex(); |
3079 | Val = DAG.getNode(Opcode: ISD::AssertZext, DL, VT, N1: Val, |
3080 | N2: DAG.getValueType(EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: NumBits))); |
3081 | } |
3082 | |
3083 | // If this is an 8 or 16-bit value, it is really passed promoted |
3084 | // to 32 bits. Insert an assert[sz]ext to capture this, then |
3085 | // truncate to the right size. |
3086 | switch (VA.getLocInfo()) { |
3087 | case CCValAssign::Full: |
3088 | break; |
3089 | case CCValAssign::BCvt: |
3090 | Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ValVT, Operand: Val); |
3091 | break; |
3092 | case CCValAssign::SExt: |
3093 | Val = DAG.getNode(Opcode: ISD::AssertSext, DL, VT, N1: Val, |
3094 | N2: DAG.getValueType(ValVT)); |
3095 | Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: ValVT, Operand: Val); |
3096 | break; |
3097 | case CCValAssign::ZExt: |
3098 | Val = DAG.getNode(Opcode: ISD::AssertZext, DL, VT, N1: Val, |
3099 | N2: DAG.getValueType(ValVT)); |
3100 | Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: ValVT, Operand: Val); |
3101 | break; |
3102 | case CCValAssign::AExt: |
3103 | Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: ValVT, Operand: Val); |
3104 | break; |
3105 | default: |
3106 | llvm_unreachable("Unknown loc info!" ); |
3107 | } |
3108 | |
3109 | InVals.push_back(Elt: Val); |
3110 | } |
3111 | |
3112 | // Start adding system SGPRs. |
3113 | if (IsEntryFunc) |
3114 | allocateSystemSGPRs(CCInfo, MF, Info&: *Info, CallConv, IsShader: IsGraphics); |
3115 | |
3116 | // DAG.getPass() returns nullptr when using new pass manager. |
3117 | // TODO: Use DAG.getMFAM() to access analysis result. |
3118 | if (DAG.getPass()) { |
3119 | auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>(); |
3120 | ArgUsageInfo.setFuncArgInfo(F: Fn, ArgInfo: Info->getArgInfo()); |
3121 | } |
3122 | |
3123 | unsigned StackArgSize = CCInfo.getStackSize(); |
3124 | Info->setBytesInStackArgArea(StackArgSize); |
3125 | |
3126 | return Chains.empty() ? Chain : |
3127 | DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other, Ops: Chains); |
3128 | } |
3129 | |
3130 | // TODO: If return values can't fit in registers, we should return as many as |
3131 | // possible in registers before passing on stack. |
3132 | bool SITargetLowering::CanLowerReturn( |
3133 | CallingConv::ID CallConv, |
3134 | MachineFunction &MF, bool IsVarArg, |
3135 | const SmallVectorImpl<ISD::OutputArg> &Outs, |
3136 | LLVMContext &Context) const { |
3137 | // Replacing returns with sret/stack usage doesn't make sense for shaders. |
3138 | // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn |
3139 | // for shaders. Vector types should be explicitly handled by CC. |
3140 | if (AMDGPU::isEntryFunctionCC(CC: CallConv)) |
3141 | return true; |
3142 | |
3143 | SmallVector<CCValAssign, 16> RVLocs; |
3144 | CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context); |
3145 | if (!CCInfo.CheckReturn(Outs, Fn: CCAssignFnForReturn(CC: CallConv, IsVarArg))) |
3146 | return false; |
3147 | |
3148 | // We must use the stack if return would require unavailable registers. |
3149 | unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF); |
3150 | unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs(); |
3151 | for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) |
3152 | if (CCInfo.isAllocated(Reg: AMDGPU::VGPR_32RegClass.getRegister(i))) |
3153 | return false; |
3154 | |
3155 | return true; |
3156 | } |
3157 | |
3158 | SDValue |
3159 | SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, |
3160 | bool isVarArg, |
3161 | const SmallVectorImpl<ISD::OutputArg> &Outs, |
3162 | const SmallVectorImpl<SDValue> &OutVals, |
3163 | const SDLoc &DL, SelectionDAG &DAG) const { |
3164 | MachineFunction &MF = DAG.getMachineFunction(); |
3165 | SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
3166 | |
3167 | if (AMDGPU::isKernel(CC: CallConv)) { |
3168 | return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs, |
3169 | OutVals, DL, DAG); |
3170 | } |
3171 | |
3172 | bool IsShader = AMDGPU::isShader(CC: CallConv); |
3173 | |
3174 | Info->setIfReturnsVoid(Outs.empty()); |
3175 | bool IsWaveEnd = Info->returnsVoid() && IsShader; |
3176 | |
3177 | // CCValAssign - represent the assignment of the return value to a location. |
3178 | SmallVector<CCValAssign, 48> RVLocs; |
3179 | SmallVector<ISD::OutputArg, 48> Splits; |
3180 | |
3181 | // CCState - Info about the registers and stack slots. |
3182 | CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, |
3183 | *DAG.getContext()); |
3184 | |
3185 | // Analyze outgoing return values. |
3186 | CCInfo.AnalyzeReturn(Outs, Fn: CCAssignFnForReturn(CC: CallConv, IsVarArg: isVarArg)); |
3187 | |
3188 | SDValue Glue; |
3189 | SmallVector<SDValue, 48> RetOps; |
3190 | RetOps.push_back(Elt: Chain); // Operand #0 = Chain (updated below) |
3191 | |
3192 | // Copy the result values into the output registers. |
3193 | for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E; |
3194 | ++I, ++RealRVLocIdx) { |
3195 | CCValAssign &VA = RVLocs[I]; |
3196 | assert(VA.isRegLoc() && "Can only return in registers!" ); |
3197 | // TODO: Partially return in registers if return values don't fit. |
3198 | SDValue Arg = OutVals[RealRVLocIdx]; |
3199 | |
3200 | // Copied from other backends. |
3201 | switch (VA.getLocInfo()) { |
3202 | case CCValAssign::Full: |
3203 | break; |
3204 | case CCValAssign::BCvt: |
3205 | Arg = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getLocVT(), Operand: Arg); |
3206 | break; |
3207 | case CCValAssign::SExt: |
3208 | Arg = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg); |
3209 | break; |
3210 | case CCValAssign::ZExt: |
3211 | Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg); |
3212 | break; |
3213 | case CCValAssign::AExt: |
3214 | Arg = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg); |
3215 | break; |
3216 | default: |
3217 | llvm_unreachable("Unknown loc info!" ); |
3218 | } |
3219 | |
3220 | Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: VA.getLocReg(), N: Arg, Glue); |
3221 | Glue = Chain.getValue(R: 1); |
3222 | RetOps.push_back(Elt: DAG.getRegister(Reg: VA.getLocReg(), VT: VA.getLocVT())); |
3223 | } |
3224 | |
3225 | // FIXME: Does sret work properly? |
3226 | if (!Info->isEntryFunction()) { |
3227 | const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); |
3228 | const MCPhysReg *I = |
3229 | TRI->getCalleeSavedRegsViaCopy(MF: &DAG.getMachineFunction()); |
3230 | if (I) { |
3231 | for (; *I; ++I) { |
3232 | if (AMDGPU::SReg_64RegClass.contains(Reg: *I)) |
3233 | RetOps.push_back(Elt: DAG.getRegister(Reg: *I, VT: MVT::i64)); |
3234 | else if (AMDGPU::SReg_32RegClass.contains(Reg: *I)) |
3235 | RetOps.push_back(Elt: DAG.getRegister(Reg: *I, VT: MVT::i32)); |
3236 | else |
3237 | llvm_unreachable("Unexpected register class in CSRsViaCopy!" ); |
3238 | } |
3239 | } |
3240 | } |
3241 | |
3242 | // Update chain and glue. |
3243 | RetOps[0] = Chain; |
3244 | if (Glue.getNode()) |
3245 | RetOps.push_back(Elt: Glue); |
3246 | |
3247 | unsigned Opc = AMDGPUISD::ENDPGM; |
3248 | if (!IsWaveEnd) |
3249 | Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_GLUE; |
3250 | return DAG.getNode(Opcode: Opc, DL, VT: MVT::Other, Ops: RetOps); |
3251 | } |
3252 | |
3253 | SDValue SITargetLowering::LowerCallResult( |
3254 | SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg, |
3255 | const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, |
3256 | SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn, |
3257 | SDValue ThisVal) const { |
3258 | CCAssignFn *RetCC = CCAssignFnForReturn(CC: CallConv, IsVarArg); |
3259 | |
3260 | // Assign locations to each value returned by this call. |
3261 | SmallVector<CCValAssign, 16> RVLocs; |
3262 | CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs, |
3263 | *DAG.getContext()); |
3264 | CCInfo.AnalyzeCallResult(Ins, Fn: RetCC); |
3265 | |
3266 | // Copy all of the result registers out of their specified physreg. |
3267 | for (CCValAssign VA : RVLocs) { |
3268 | SDValue Val; |
3269 | |
3270 | if (VA.isRegLoc()) { |
3271 | Val = DAG.getCopyFromReg(Chain, dl: DL, Reg: VA.getLocReg(), VT: VA.getLocVT(), Glue: InGlue); |
3272 | Chain = Val.getValue(R: 1); |
3273 | InGlue = Val.getValue(R: 2); |
3274 | } else if (VA.isMemLoc()) { |
3275 | report_fatal_error(reason: "TODO: return values in memory" ); |
3276 | } else |
3277 | llvm_unreachable("unknown argument location type" ); |
3278 | |
3279 | switch (VA.getLocInfo()) { |
3280 | case CCValAssign::Full: |
3281 | break; |
3282 | case CCValAssign::BCvt: |
3283 | Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getValVT(), Operand: Val); |
3284 | break; |
3285 | case CCValAssign::ZExt: |
3286 | Val = DAG.getNode(Opcode: ISD::AssertZext, DL, VT: VA.getLocVT(), N1: Val, |
3287 | N2: DAG.getValueType(VA.getValVT())); |
3288 | Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VA.getValVT(), Operand: Val); |
3289 | break; |
3290 | case CCValAssign::SExt: |
3291 | Val = DAG.getNode(Opcode: ISD::AssertSext, DL, VT: VA.getLocVT(), N1: Val, |
3292 | N2: DAG.getValueType(VA.getValVT())); |
3293 | Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VA.getValVT(), Operand: Val); |
3294 | break; |
3295 | case CCValAssign::AExt: |
3296 | Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VA.getValVT(), Operand: Val); |
3297 | break; |
3298 | default: |
3299 | llvm_unreachable("Unknown loc info!" ); |
3300 | } |
3301 | |
3302 | InVals.push_back(Elt: Val); |
3303 | } |
3304 | |
3305 | return Chain; |
3306 | } |
3307 | |
3308 | // Add code to pass the special inputs required by used features, separate
3309 | // from the explicit user arguments present in the IR.
3310 | void SITargetLowering::passSpecialInputs( |
3311 | CallLoweringInfo &CLI, |
3312 | CCState &CCInfo, |
3313 | const SIMachineFunctionInfo &Info, |
3314 | SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass, |
3315 | SmallVectorImpl<SDValue> &MemOpChains, |
3316 | SDValue Chain) const { |
3317 | // If we don't have a call site, this was a call inserted by |
3318 | // legalization. These can never use special inputs. |
3319 | if (!CLI.CB) |
3320 | return; |
3321 | |
3322 | SelectionDAG &DAG = CLI.DAG; |
3323 | const SDLoc &DL = CLI.DL; |
3324 | const Function &F = DAG.getMachineFunction().getFunction(); |
3325 | |
3326 | const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); |
3327 | const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo(); |
3328 | |
3329 | const AMDGPUFunctionArgInfo *CalleeArgInfo |
3330 | = &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo; |
3331 | if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) { |
3332 | // DAG.getPass() returns nullptr when using new pass manager. |
3333 | // TODO: Use DAG.getMFAM() to access analysis result. |
3334 | if (DAG.getPass()) { |
3335 | auto &ArgUsageInfo = |
3336 | DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>(); |
3337 | CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(F: *CalleeFunc); |
3338 | } |
3339 | } |
3340 | |
3341 | // TODO: Unify with private memory register handling. This is complicated by |
3342 | // the fact that at least in kernels, the input argument is not necessarily |
3343 | // in the same location as the input. |
3344 | static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue, |
3345 | StringLiteral> ImplicitAttrs[] = { |
3346 | {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr" }, |
3347 | {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" }, |
3348 | {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr" }, |
3349 | {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id" }, |
3350 | {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x" }, |
3351 | {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, "amdgpu-no-workgroup-id-y" },
3352 | {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, "amdgpu-no-workgroup-id-z" },
3353 | {AMDGPUFunctionArgInfo::LDS_KERNEL_ID, "amdgpu-no-lds-kernel-id" },
3354 | }; |
3355 | |
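// For each special input the callee may use, forward the caller's incoming
// value (or a placeholder) into the register or stack slot the callee's ABI
// expects, skipping inputs the call site is known not to need.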
3356 | for (auto Attr : ImplicitAttrs) { |
3357 | const ArgDescriptor *OutgoingArg; |
3358 | const TargetRegisterClass *ArgRC; |
3359 | LLT ArgTy; |
3360 | |
3361 | AMDGPUFunctionArgInfo::PreloadedValue InputID = Attr.first; |
3362 | |
3363 | // If the callee does not use the attribute value, skip copying the value. |
3364 | if (CLI.CB->hasFnAttr(Kind: Attr.second)) |
3365 | continue; |
3366 | |
3367 | std::tie(args&: OutgoingArg, args&: ArgRC, args&: ArgTy) = |
3368 | CalleeArgInfo->getPreloadedValue(Value: InputID); |
3369 | if (!OutgoingArg) |
3370 | continue; |
3371 | |
3372 | const ArgDescriptor *IncomingArg; |
3373 | const TargetRegisterClass *IncomingArgRC; |
3374 | LLT Ty; |
3375 | std::tie(args&: IncomingArg, args&: IncomingArgRC, args&: Ty) = |
3376 | CallerArgInfo.getPreloadedValue(Value: InputID); |
3377 | assert(IncomingArgRC == ArgRC); |
3378 | |
3379 | // All special arguments are ints for now. |
3380 | EVT ArgVT = TRI->getSpillSize(RC: *ArgRC) == 8 ? MVT::i64 : MVT::i32; |
3381 | SDValue InputReg; |
3382 | |
3383 | if (IncomingArg) { |
3384 | InputReg = loadInputValue(DAG, RC: ArgRC, VT: ArgVT, SL: DL, Arg: *IncomingArg); |
3385 | } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) { |
3386 | // The implicit arg ptr is special because it doesn't have a corresponding |
3387 | // input for kernels, and is computed from the kernarg segment pointer. |
3388 | InputReg = getImplicitArgPtr(DAG, SL: DL); |
3389 | } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) { |
3390 | std::optional<uint32_t> Id = |
3391 | AMDGPUMachineFunction::getLDSKernelIdMetadata(F); |
3392 | if (Id.has_value()) { |
3393 | InputReg = DAG.getConstant(Val: *Id, DL, VT: ArgVT); |
3394 | } else { |
3395 | InputReg = DAG.getUNDEF(VT: ArgVT); |
3396 | } |
3397 | } else { |
3398 | // We may have proven the input wasn't needed, although the ABI still
3399 | // requires it. We just need to allocate the register appropriately.
3400 | InputReg = DAG.getUNDEF(VT: ArgVT); |
3401 | } |
3402 | |
3403 | if (OutgoingArg->isRegister()) { |
3404 | RegsToPass.emplace_back(Args: OutgoingArg->getRegister(), Args&: InputReg); |
3405 | if (!CCInfo.AllocateReg(Reg: OutgoingArg->getRegister())) |
3406 | report_fatal_error(reason: "failed to allocate implicit input argument" ); |
3407 | } else { |
3408 | unsigned SpecialArgOffset = |
3409 | CCInfo.AllocateStack(Size: ArgVT.getStoreSize(), Alignment: Align(4)); |
3410 | SDValue ArgStore = storeStackInputValue(DAG, SL: DL, Chain, ArgVal: InputReg, |
3411 | Offset: SpecialArgOffset); |
3412 | MemOpChains.push_back(Elt: ArgStore); |
3413 | } |
3414 | } |
3415 | |
3416 | // Pack the workitem IDs into a single register, or pass them as-is if they
3417 | // are already packed.
3418 | const ArgDescriptor *OutgoingArg; |
3419 | const TargetRegisterClass *ArgRC; |
3420 | LLT Ty; |
3421 | |
3422 | std::tie(args&: OutgoingArg, args&: ArgRC, args&: Ty) = |
3423 | CalleeArgInfo->getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_X); |
3424 | if (!OutgoingArg) |
3425 | std::tie(args&: OutgoingArg, args&: ArgRC, args&: Ty) = |
3426 | CalleeArgInfo->getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Y); |
3427 | if (!OutgoingArg) |
3428 | std::tie(args&: OutgoingArg, args&: ArgRC, args&: Ty) = |
3429 | CalleeArgInfo->getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Z); |
3430 | if (!OutgoingArg) |
3431 | return; |
3432 | |
3433 | const ArgDescriptor *IncomingArgX = std::get<0>( |
3434 | t: CallerArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_X)); |
3435 | const ArgDescriptor *IncomingArgY = std::get<0>( |
3436 | t: CallerArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Y)); |
3437 | const ArgDescriptor *IncomingArgZ = std::get<0>( |
3438 | t: CallerArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Z)); |
3439 | |
3440 | SDValue InputReg; |
3441 | SDLoc SL; |
3442 | |
3443 | const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr(Kind: "amdgpu-no-workitem-id-x" ); |
3444 | const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr(Kind: "amdgpu-no-workitem-id-y" ); |
3445 | const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr(Kind: "amdgpu-no-workitem-id-z" ); |
3446 | |
3447 | // If the incoming IDs are not packed, we need to pack them.
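// In the packed layout, X occupies bits [9:0], Y bits [19:10], and Z bits
// [29:20] of a single 32-bit register, hence the shifts by 10 and 20 below.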
3448 | if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX && |
3449 | NeedWorkItemIDX) { |
3450 | if (Subtarget->getMaxWorkitemID(Kernel: F, Dimension: 0) != 0) { |
3451 | InputReg = loadInputValue(DAG, RC: ArgRC, VT: MVT::i32, SL: DL, Arg: *IncomingArgX); |
3452 | } else { |
3453 | InputReg = DAG.getConstant(Val: 0, DL, VT: MVT::i32); |
3454 | } |
3455 | } |
3456 | |
3457 | if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY && |
3458 | NeedWorkItemIDY && Subtarget->getMaxWorkitemID(Kernel: F, Dimension: 1) != 0) { |
3459 | SDValue Y = loadInputValue(DAG, RC: ArgRC, VT: MVT::i32, SL: DL, Arg: *IncomingArgY); |
3460 | Y = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: Y, |
3461 | N2: DAG.getShiftAmountConstant(Val: 10, VT: MVT::i32, DL: SL)); |
3462 | InputReg = InputReg.getNode() ? |
3463 | DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: InputReg, N2: Y) : Y; |
3464 | } |
3465 | |
3466 | if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ && |
3467 | NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(Kernel: F, Dimension: 2) != 0) { |
3468 | SDValue Z = loadInputValue(DAG, RC: ArgRC, VT: MVT::i32, SL: DL, Arg: *IncomingArgZ); |
3469 | Z = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: Z, |
3470 | N2: DAG.getShiftAmountConstant(Val: 20, VT: MVT::i32, DL: SL)); |
3471 | InputReg = InputReg.getNode() ? |
3472 | DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: InputReg, N2: Z) : Z; |
3473 | } |
3474 | |
3475 | if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) { |
3476 | if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) { |
3477 | // We're in a situation where the outgoing function requires the workitem |
3478 | // ID, but the calling function does not have it (e.g. a graphics function
3479 | // calling a C calling convention function). This is illegal, but we need |
3480 | // to produce something. |
3481 | InputReg = DAG.getUNDEF(VT: MVT::i32); |
3482 | } else { |
3483 | // Workitem IDs are already packed; any of the present incoming arguments
3484 | // will carry all required fields.
3485 | ArgDescriptor IncomingArg = ArgDescriptor::createArg( |
3486 | Arg: IncomingArgX ? *IncomingArgX : |
3487 | IncomingArgY ? *IncomingArgY : |
3488 | *IncomingArgZ, Mask: ~0u); |
3489 | InputReg = loadInputValue(DAG, RC: ArgRC, VT: MVT::i32, SL: DL, Arg: IncomingArg); |
3490 | } |
3491 | } |
3492 | |
3493 | if (OutgoingArg->isRegister()) { |
3494 | if (InputReg) |
3495 | RegsToPass.emplace_back(Args: OutgoingArg->getRegister(), Args&: InputReg); |
3496 | |
3497 | CCInfo.AllocateReg(Reg: OutgoingArg->getRegister()); |
3498 | } else { |
3499 | unsigned SpecialArgOffset = CCInfo.AllocateStack(Size: 4, Alignment: Align(4)); |
3500 | if (InputReg) { |
3501 | SDValue ArgStore = storeStackInputValue(DAG, SL: DL, Chain, ArgVal: InputReg, |
3502 | Offset: SpecialArgOffset); |
3503 | MemOpChains.push_back(Elt: ArgStore); |
3504 | } |
3505 | } |
3506 | } |
3507 | |
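// Guaranteed tail-call optimization is only ever honored for the 'fastcc'
// calling convention.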
3508 | static bool canGuaranteeTCO(CallingConv::ID CC) { |
3509 | return CC == CallingConv::Fast; |
3510 | } |
3511 | |
3512 | /// Return true if we might ever do TCO for calls with this calling convention. |
3513 | static bool mayTailCallThisCC(CallingConv::ID CC) { |
3514 | switch (CC) { |
3515 | case CallingConv::C: |
3516 | case CallingConv::AMDGPU_Gfx: |
3517 | return true; |
3518 | default: |
3519 | return canGuaranteeTCO(CC); |
3520 | } |
3521 | } |
3522 | |
3523 | bool SITargetLowering::isEligibleForTailCallOptimization( |
3524 | SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg, |
3525 | const SmallVectorImpl<ISD::OutputArg> &Outs, |
3526 | const SmallVectorImpl<SDValue> &OutVals, |
3527 | const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const { |
3528 | if (AMDGPU::isChainCC(CC: CalleeCC)) |
3529 | return true; |
3530 | |
3531 | if (!mayTailCallThisCC(CC: CalleeCC)) |
3532 | return false; |
3533 | |
3534 | // For a divergent call target, we need to do a waterfall loop over the |
3535 | // possible callees which precludes us from using a simple jump. |
3536 | if (Callee->isDivergent()) |
3537 | return false; |
3538 | |
3539 | MachineFunction &MF = DAG.getMachineFunction(); |
3540 | const Function &CallerF = MF.getFunction(); |
3541 | CallingConv::ID CallerCC = CallerF.getCallingConv(); |
3542 | const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); |
3543 | const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); |
3544 | |
3545 | // Kernels aren't callable, and don't have a live-in return address, so it
3546 | // doesn't make sense to do a tail call with entry functions.
3547 | if (!CallerPreserved) |
3548 | return false; |
3549 | |
3550 | bool CCMatch = CallerCC == CalleeCC; |
3551 | |
3552 | if (DAG.getTarget().Options.GuaranteedTailCallOpt) { |
3553 | if (canGuaranteeTCO(CC: CalleeCC) && CCMatch) |
3554 | return true; |
3555 | return false; |
3556 | } |
3557 | |
3558 | // TODO: Can we handle var args? |
3559 | if (IsVarArg) |
3560 | return false; |
3561 | |
3562 | for (const Argument &Arg : CallerF.args()) { |
3563 | if (Arg.hasByValAttr()) |
3564 | return false; |
3565 | } |
3566 | |
3567 | LLVMContext &Ctx = *DAG.getContext(); |
3568 | |
3569 | // Check that the call results are passed in the same way. |
3570 | if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C&: Ctx, Ins, |
3571 | CalleeFn: CCAssignFnForCall(CC: CalleeCC, IsVarArg), |
3572 | CallerFn: CCAssignFnForCall(CC: CallerCC, IsVarArg))) |
3573 | return false; |
3574 | |
3575 | // The callee has to preserve all registers the caller needs to preserve. |
3576 | if (!CCMatch) { |
3577 | const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); |
3578 | if (!TRI->regmaskSubsetEqual(mask0: CallerPreserved, mask1: CalleePreserved)) |
3579 | return false; |
3580 | } |
3581 | |
3582 | // Nothing more to check if the callee is taking no arguments. |
3583 | if (Outs.empty()) |
3584 | return true; |
3585 | |
3586 | SmallVector<CCValAssign, 16> ArgLocs; |
3587 | CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx); |
3588 | |
3589 | CCInfo.AnalyzeCallOperands(Outs, Fn: CCAssignFnForCall(CC: CalleeCC, IsVarArg)); |
3590 | |
3591 | const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); |
3592 | // If the stack arguments for this call do not fit into our own save area,
3593 | // then the call cannot be made a tail call.
3594 | // TODO: Is this really necessary? |
3595 | if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea()) |
3596 | return false; |
3597 | |
3598 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
3599 | return parametersInCSRMatch(MRI, CallerPreservedMask: CallerPreserved, ArgLocs, OutVals); |
3600 | } |
3601 | |
3602 | bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { |
3603 | if (!CI->isTailCall()) |
3604 | return false; |
3605 | |
3606 | const Function *ParentFn = CI->getParent()->getParent(); |
3607 | if (AMDGPU::isEntryFunctionCC(CC: ParentFn->getCallingConv())) |
3608 | return false; |
3609 | return true; |
3610 | } |
3611 | |
3612 | // The wave scratch offset register is used as the global base pointer. |
3613 | SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, |
3614 | SmallVectorImpl<SDValue> &InVals) const { |
3615 | CallingConv::ID CallConv = CLI.CallConv; |
3616 | bool IsChainCallConv = AMDGPU::isChainCC(CC: CallConv); |
3617 | |
3618 | SelectionDAG &DAG = CLI.DAG; |
3619 | |
3620 | TargetLowering::ArgListEntry RequestedExec; |
3621 | if (IsChainCallConv) { |
3622 | // The last argument should be the value that we need to put in EXEC. |
3623 | // Pop it out of CLI.Outs and CLI.OutVals before we do any processing so we |
3624 | // don't treat it like the rest of the arguments. |
3625 | RequestedExec = CLI.Args.back(); |
3626 | assert(RequestedExec.Node && "No node for EXEC" ); |
3627 | |
3628 | if (!RequestedExec.Ty->isIntegerTy(Bitwidth: Subtarget->getWavefrontSize())) |
3629 | return lowerUnhandledCall(CLI, InVals, Reason: "Invalid value for EXEC" ); |
3630 | |
3631 | assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg" ); |
3632 | CLI.Outs.pop_back(); |
3633 | CLI.OutVals.pop_back(); |
3634 | |
3635 | if (RequestedExec.Ty->isIntegerTy(Bitwidth: 64)) { |
3636 | assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up" ); |
3637 | CLI.Outs.pop_back(); |
3638 | CLI.OutVals.pop_back(); |
3639 | } |
3640 | |
3641 | assert(CLI.Outs.back().OrigArgIndex != 2 && |
3642 | "Haven't popped all the pieces of the EXEC mask" ); |
3643 | } |
3644 | |
3645 | const SDLoc &DL = CLI.DL; |
3646 | SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; |
3647 | SmallVector<SDValue, 32> &OutVals = CLI.OutVals; |
3648 | SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; |
3649 | SDValue Chain = CLI.Chain; |
3650 | SDValue Callee = CLI.Callee; |
3651 | bool &IsTailCall = CLI.IsTailCall; |
3652 | bool IsVarArg = CLI.IsVarArg; |
3653 | bool IsSibCall = false; |
3654 | MachineFunction &MF = DAG.getMachineFunction(); |
3655 | |
3656 | if (Callee.isUndef() || isNullConstant(V: Callee)) { |
3657 | if (!CLI.IsTailCall) { |
3658 | for (ISD::InputArg &Arg : CLI.Ins) |
3659 | InVals.push_back(Elt: DAG.getUNDEF(VT: Arg.VT)); |
3660 | } |
3661 | |
3662 | return Chain; |
3663 | } |
3664 | |
3665 | if (IsVarArg) { |
3666 | return lowerUnhandledCall(CLI, InVals, |
3667 | Reason: "unsupported call to variadic function " ); |
3668 | } |
3669 | |
3670 | if (!CLI.CB) |
3671 | report_fatal_error(reason: "unsupported libcall legalization" ); |
3672 | |
3673 | if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) { |
3674 | return lowerUnhandledCall(CLI, InVals, |
3675 | Reason: "unsupported required tail call to function " ); |
3676 | } |
3677 | |
3678 | if (IsTailCall) { |
3679 | IsTailCall = isEligibleForTailCallOptimization( |
3680 | Callee, CalleeCC: CallConv, IsVarArg, Outs, OutVals, Ins, DAG); |
3681 | if (!IsTailCall && |
3682 | ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) { |
3683 | report_fatal_error(reason: "failed to perform tail call elimination on a call " |
3684 | "site marked musttail or on llvm.amdgcn.cs.chain" ); |
3685 | } |
3686 | |
3687 | bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; |
3688 | |
3689 | // A sibling call is one where we're under the usual C ABI and not planning |
3690 | // to change that but can still do a tail call: |
3691 | if (!TailCallOpt && IsTailCall) |
3692 | IsSibCall = true; |
3693 | |
3694 | if (IsTailCall) |
3695 | ++NumTailCalls; |
3696 | } |
3697 | |
3698 | const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
3699 | SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; |
3700 | SmallVector<SDValue, 8> MemOpChains; |
3701 | |
3702 | // Analyze operands of the call, assigning locations to each operand. |
3703 | SmallVector<CCValAssign, 16> ArgLocs; |
3704 | CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); |
3705 | CCAssignFn *AssignFn = CCAssignFnForCall(CC: CallConv, IsVarArg); |
3706 | |
3707 | if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CC: CallConv)) { |
3708 | // With a fixed ABI, allocate fixed registers before user arguments. |
3709 | passSpecialInputs(CLI, CCInfo, Info: *Info, RegsToPass, MemOpChains, Chain); |
3710 | } |
3711 | |
3712 | CCInfo.AnalyzeCallOperands(Outs, Fn: AssignFn); |
3713 | |
3714 | // Get a count of how many bytes are to be pushed on the stack. |
3715 | unsigned NumBytes = CCInfo.getStackSize(); |
3716 | |
3717 | if (IsSibCall) { |
3718 | // Since we're not changing the ABI to make this a tail call, the memory |
3719 | // operands are already available in the caller's incoming argument space. |
3720 | NumBytes = 0; |
3721 | } |
3722 | |
3723 | // FPDiff is the byte offset of the call's argument area from the callee's. |
3724 | // Stores to callee stack arguments will be placed in FixedStackSlots offset |
3725 | // by this amount for a tail call. In a sibling call it must be 0 because the |
3726 | // caller will deallocate the entire stack and the callee still expects its |
3727 | // arguments to begin at SP+0. Completely unused for non-tail calls. |
3728 | int32_t FPDiff = 0; |
3729 | MachineFrameInfo &MFI = MF.getFrameInfo(); |
3730 | |
3731 | // Adjust the stack pointer for the new arguments... |
3732 | // These operations are automatically eliminated by the prolog/epilog pass |
3733 | if (!IsSibCall) |
3734 | Chain = DAG.getCALLSEQ_START(Chain, InSize: 0, OutSize: 0, DL); |
3735 | |
3736 | if (!IsSibCall || IsChainCallConv) { |
3737 | if (!Subtarget->enableFlatScratch()) { |
3738 | SmallVector<SDValue, 4> CopyFromChains; |
3739 | |
3740 | // In the HSA case, this should be an identity copy. |
3741 | SDValue ScratchRSrcReg |
3742 | = DAG.getCopyFromReg(Chain, dl: DL, Reg: Info->getScratchRSrcReg(), VT: MVT::v4i32); |
3743 | RegsToPass.emplace_back(Args: IsChainCallConv |
3744 | ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51 |
3745 | : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, |
3746 | Args&: ScratchRSrcReg); |
3747 | CopyFromChains.push_back(Elt: ScratchRSrcReg.getValue(R: 1)); |
3748 | Chain = DAG.getTokenFactor(DL, Vals&: CopyFromChains); |
3749 | } |
3750 | } |
3751 | |
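// Stack addresses are offsets in the 32-bit private (scratch) address space.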
3752 | MVT PtrVT = MVT::i32; |
3753 | |
3754 | // Walk the register/memloc assignments, inserting copies/loads. |
3755 | for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { |
3756 | CCValAssign &VA = ArgLocs[i]; |
3757 | SDValue Arg = OutVals[i]; |
3758 | |
3759 | // Promote the value if needed. |
3760 | switch (VA.getLocInfo()) { |
3761 | case CCValAssign::Full: |
3762 | break; |
3763 | case CCValAssign::BCvt: |
3764 | Arg = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getLocVT(), Operand: Arg); |
3765 | break; |
3766 | case CCValAssign::ZExt: |
3767 | Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg); |
3768 | break; |
3769 | case CCValAssign::SExt: |
3770 | Arg = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg); |
3771 | break; |
3772 | case CCValAssign::AExt: |
3773 | Arg = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg); |
3774 | break; |
3775 | case CCValAssign::FPExt: |
3776 | Arg = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg); |
3777 | break; |
3778 | default: |
3779 | llvm_unreachable("Unknown loc info!" ); |
3780 | } |
3781 | |
3782 | if (VA.isRegLoc()) { |
3783 | RegsToPass.push_back(Elt: std::pair(VA.getLocReg(), Arg)); |
3784 | } else { |
3785 | assert(VA.isMemLoc()); |
3786 | |
3787 | SDValue DstAddr; |
3788 | MachinePointerInfo DstInfo; |
3789 | |
3790 | unsigned LocMemOffset = VA.getLocMemOffset(); |
3791 | int32_t Offset = LocMemOffset; |
3792 | |
3793 | SDValue PtrOff = DAG.getConstant(Val: Offset, DL, VT: PtrVT); |
3794 | MaybeAlign Alignment; |
3795 | |
3796 | if (IsTailCall) { |
3797 | ISD::ArgFlagsTy Flags = Outs[i].Flags; |
3798 | unsigned OpSize = Flags.isByVal() ? |
3799 | Flags.getByValSize() : VA.getValVT().getStoreSize(); |
3800 | |
3801 | // FIXME: We can have better than the minimum byval required alignment. |
3802 | Alignment = |
3803 | Flags.isByVal() |
3804 | ? Flags.getNonZeroByValAlign() |
3805 | : commonAlignment(A: Subtarget->getStackAlignment(), Offset); |
3806 | |
3807 | Offset = Offset + FPDiff; |
3808 | int FI = MFI.CreateFixedObject(Size: OpSize, SPOffset: Offset, IsImmutable: true); |
3809 | |
3810 | DstAddr = DAG.getFrameIndex(FI, VT: PtrVT); |
3811 | DstInfo = MachinePointerInfo::getFixedStack(MF, FI); |
3812 | |
3813 | // Make sure any stack arguments overlapping with where we're storing |
3814 | // are loaded before this eventual operation. Otherwise they'll be |
3815 | // clobbered. |
3816 | |
3817 | // FIXME: Why is this really necessary? This seems to just result in a |
3818 | // lot of code to copy the stack and write them back to the same |
3819 | // locations, which are supposed to be immutable? |
3820 | Chain = addTokenForArgument(Chain, DAG, MFI, ClobberedFI: FI); |
3821 | } else { |
3822 | // Stores to the argument stack area are relative to the stack pointer. |
3823 | SDValue SP = DAG.getCopyFromReg(Chain, dl: DL, Reg: Info->getStackPtrOffsetReg(), |
3824 | VT: MVT::i32); |
3825 | DstAddr = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, N1: SP, N2: PtrOff); |
3826 | DstInfo = MachinePointerInfo::getStack(MF, Offset: LocMemOffset); |
3827 | Alignment = |
3828 | commonAlignment(A: Subtarget->getStackAlignment(), Offset: LocMemOffset); |
3829 | } |
3830 | |
3831 | if (Outs[i].Flags.isByVal()) { |
3832 | SDValue SizeNode = |
3833 | DAG.getConstant(Val: Outs[i].Flags.getByValSize(), DL, VT: MVT::i32); |
3834 | SDValue Cpy = |
3835 | DAG.getMemcpy(Chain, dl: DL, Dst: DstAddr, Src: Arg, Size: SizeNode, |
3836 | Alignment: Outs[i].Flags.getNonZeroByValAlign(), |
3837 | /*isVol = */ false, /*AlwaysInline = */ true, |
3838 | /*CI=*/nullptr, OverrideTailCall: std::nullopt, DstPtrInfo: DstInfo, |
3839 | SrcPtrInfo: MachinePointerInfo(AMDGPUAS::PRIVATE_ADDRESS)); |
3840 | |
3841 | MemOpChains.push_back(Elt: Cpy); |
3842 | } else { |
3843 | SDValue Store = |
3844 | DAG.getStore(Chain, dl: DL, Val: Arg, Ptr: DstAddr, PtrInfo: DstInfo, Alignment); |
3845 | MemOpChains.push_back(Elt: Store); |
3846 | } |
3847 | } |
3848 | } |
3849 | |
3850 | if (!MemOpChains.empty()) |
3851 | Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other, Ops: MemOpChains); |
3852 | |
3853 | // Build a sequence of copy-to-reg nodes chained together with token chain |
3854 | // and flag operands which copy the outgoing args into the appropriate regs. |
3855 | SDValue InGlue; |
3856 | for (auto &RegToPass : RegsToPass) { |
3857 | Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: RegToPass.first, |
3858 | N: RegToPass.second, Glue: InGlue); |
3859 | InGlue = Chain.getValue(R: 1); |
3860 | } |
3861 | |
3863 | // We don't usually want to end the call-sequence here because we would tidy |
3864 | // the frame up *after* the call, however in the ABI-changing tail-call case |
3865 | // we've carefully laid out the parameters so that when sp is reset they'll be |
3866 | // in the correct location. |
3867 | if (IsTailCall && !IsSibCall) { |
3868 | Chain = DAG.getCALLSEQ_END(Chain, Size1: NumBytes, Size2: 0, Glue: InGlue, DL); |
3869 | InGlue = Chain.getValue(R: 1); |
3870 | } |
3871 | |
3872 | std::vector<SDValue> Ops; |
3873 | Ops.push_back(x: Chain); |
3874 | Ops.push_back(x: Callee); |
3875 | // Add a redundant copy of the callee global which will not be legalized, as |
3876 | // we need direct access to the callee later. |
3877 | if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Val&: Callee)) { |
3878 | const GlobalValue *GV = GSD->getGlobal(); |
3879 | Ops.push_back(x: DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i64)); |
3880 | } else { |
3881 | Ops.push_back(x: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i64)); |
3882 | } |
3883 | |
3884 | if (IsTailCall) { |
3885 | // Each tail call may have to adjust the stack by a different amount, so |
3886 | // this information must travel along with the operation for eventual |
3887 | // consumption by emitEpilogue. |
3888 | Ops.push_back(x: DAG.getTargetConstant(Val: FPDiff, DL, VT: MVT::i32)); |
3889 | } |
3890 | |
3891 | if (IsChainCallConv) |
3892 | Ops.push_back(x: RequestedExec.Node); |
3893 | |
3894 | // Add argument registers to the end of the list so that they are known live |
3895 | // into the call. |
3896 | for (auto &RegToPass : RegsToPass) { |
3897 | Ops.push_back(x: DAG.getRegister(Reg: RegToPass.first, |
3898 | VT: RegToPass.second.getValueType())); |
3899 | } |
3900 | |
3901 | // Add a register mask operand representing the call-preserved registers. |
3902 | auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); |
3903 | const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv); |
3904 | assert(Mask && "Missing call preserved mask for calling convention" ); |
3905 | Ops.push_back(x: DAG.getRegisterMask(RegMask: Mask)); |
3906 | |
3907 | if (SDValue Token = CLI.ConvergenceControlToken) { |
3908 | SmallVector<SDValue, 2> GlueOps; |
3909 | GlueOps.push_back(Elt: Token); |
3910 | if (InGlue) |
3911 | GlueOps.push_back(Elt: InGlue); |
3912 | |
3913 | InGlue = SDValue(DAG.getMachineNode(Opcode: TargetOpcode::CONVERGENCECTRL_GLUE, dl: DL, |
3914 | VT: MVT::Glue, Ops: GlueOps), |
3915 | 0); |
3916 | } |
3917 | |
3918 | if (InGlue) |
3919 | Ops.push_back(x: InGlue); |
3920 | |
3921 | SDVTList NodeTys = DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue); |
3922 | |
  // If we're doing a tail call, use a TC_RETURN here rather than an
  // actual call instruction.
3925 | if (IsTailCall) { |
3926 | MFI.setHasTailCall(); |
3927 | unsigned OPC = AMDGPUISD::TC_RETURN; |
3928 | switch (CallConv) { |
3929 | case CallingConv::AMDGPU_Gfx: |
3930 | OPC = AMDGPUISD::TC_RETURN_GFX; |
3931 | break; |
3932 | case CallingConv::AMDGPU_CS_Chain: |
3933 | case CallingConv::AMDGPU_CS_ChainPreserve: |
3934 | OPC = AMDGPUISD::TC_RETURN_CHAIN; |
3935 | break; |
3936 | } |
3937 | |
3938 | return DAG.getNode(Opcode: OPC, DL, VTList: NodeTys, Ops); |
3939 | } |
3940 | |
3941 | // Returns a chain and a flag for retval copy to use. |
3942 | SDValue Call = DAG.getNode(Opcode: AMDGPUISD::CALL, DL, VTList: NodeTys, Ops); |
3943 | Chain = Call.getValue(R: 0); |
3944 | InGlue = Call.getValue(R: 1); |
3945 | |
3946 | uint64_t CalleePopBytes = NumBytes; |
3947 | Chain = DAG.getCALLSEQ_END(Chain, Size1: 0, Size2: CalleePopBytes, Glue: InGlue, DL); |
3948 | if (!Ins.empty()) |
3949 | InGlue = Chain.getValue(R: 1); |
3950 | |
3951 | // Handle result values, copying them out of physregs into vregs that we |
3952 | // return. |
3953 | return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG, |
3954 | InVals, /*IsThisReturn=*/false, ThisVal: SDValue()); |
3955 | } |
3956 | |
3957 | // This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC, |
3958 | // except for applying the wave size scale to the increment amount. |
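// As a rough worked example (an illustration, not taken from the code below):
// the stack pointer here is shared by all lanes of the wave, so allocating N
// bytes per lane must bump it by N << wavefront_size_log2 bytes; on a wave64
// target a 16-byte-per-lane alloca consumes 16 << 6 = 1024 bytes of the
// wave's scratch.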
3959 | SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl( |
3960 | SDValue Op, SelectionDAG &DAG) const { |
3961 | const MachineFunction &MF = DAG.getMachineFunction(); |
3962 | const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
3963 | |
3964 | SDLoc dl(Op); |
3965 | EVT VT = Op.getValueType(); |
3966 | SDValue Tmp1 = Op; |
3967 | SDValue Tmp2 = Op.getValue(R: 1); |
3968 | SDValue Tmp3 = Op.getOperand(i: 2); |
3969 | SDValue Chain = Tmp1.getOperand(i: 0); |
3970 | |
3971 | Register SPReg = Info->getStackPtrOffsetReg(); |
3972 | |
3973 | // Chain the dynamic stack allocation so that it doesn't modify the stack |
3974 | // pointer when other instructions are using the stack. |
3975 | Chain = DAG.getCALLSEQ_START(Chain, InSize: 0, OutSize: 0, DL: dl); |
3976 | |
3977 | SDValue Size = Tmp2.getOperand(i: 1); |
3978 | SDValue SP = DAG.getCopyFromReg(Chain, dl, Reg: SPReg, VT); |
3979 | Chain = SP.getValue(R: 1); |
3980 | MaybeAlign Alignment = cast<ConstantSDNode>(Val&: Tmp3)->getMaybeAlignValue(); |
3981 | const TargetFrameLowering *TFL = Subtarget->getFrameLowering(); |
3982 | unsigned Opc = |
3983 | TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp ? |
3984 | ISD::ADD : ISD::SUB; |
3985 | |
3986 | SDValue ScaledSize = DAG.getNode( |
3987 | Opcode: ISD::SHL, DL: dl, VT, N1: Size, |
3988 | N2: DAG.getConstant(Val: Subtarget->getWavefrontSizeLog2(), DL: dl, VT: MVT::i32)); |
3989 | |
3990 | Align StackAlign = TFL->getStackAlign(); |
3991 | Tmp1 = DAG.getNode(Opcode: Opc, DL: dl, VT, N1: SP, N2: ScaledSize); // Value |
3992 | if (Alignment && *Alignment > StackAlign) { |
3993 | Tmp1 = DAG.getNode(Opcode: ISD::AND, DL: dl, VT, N1: Tmp1, |
3994 | N2: DAG.getConstant(Val: -(uint64_t)Alignment->value() |
3995 | << Subtarget->getWavefrontSizeLog2(), |
3996 | DL: dl, VT)); |
3997 | } |
3998 | |
3999 | Chain = DAG.getCopyToReg(Chain, dl, Reg: SPReg, N: Tmp1); // Output chain |
4000 | Tmp2 = DAG.getCALLSEQ_END(Chain, Size1: 0, Size2: 0, Glue: SDValue(), DL: dl); |
4001 | |
4002 | return DAG.getMergeValues(Ops: {Tmp1, Tmp2}, dl); |
4003 | } |
4004 | |
4005 | SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, |
4006 | SelectionDAG &DAG) const { |
4007 | // We only handle constant sizes here to allow non-entry block, static sized |
4008 | // allocas. A truly dynamic value is more difficult to support because we |
4009 | // don't know if the size value is uniform or not. If the size isn't uniform, |
4010 | // we would need to do a wave reduction to get the maximum size to know how |
4011 | // much to increment the uniform stack pointer. |
4012 | SDValue Size = Op.getOperand(i: 1); |
4013 | if (isa<ConstantSDNode>(Val: Size)) |
4014 | return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion. |
4015 | |
4016 | return AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(Op, DAG); |
4017 | } |
4018 | |
4019 | SDValue SITargetLowering::LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const { |
4020 | if (Op.getValueType() != MVT::i32) |
4021 | return Op; // Defer to cannot select error. |
4022 | |
4023 | Register SP = getStackPointerRegisterToSaveRestore(); |
4024 | SDLoc SL(Op); |
4025 | |
4026 | SDValue CopyFromSP = DAG.getCopyFromReg(Chain: Op->getOperand(Num: 0), dl: SL, Reg: SP, VT: MVT::i32); |
4027 | |
4028 | // Convert from wave uniform to swizzled vector address. This should protect |
4029 | // from any edge cases where the stacksave result isn't directly used with |
4030 | // stackrestore. |
4031 | SDValue VectorAddress = |
4032 | DAG.getNode(Opcode: AMDGPUISD::WAVE_ADDRESS, DL: SL, VT: MVT::i32, Operand: CopyFromSP); |
4033 | return DAG.getMergeValues(Ops: {VectorAddress, CopyFromSP.getValue(R: 1)}, dl: SL); |
4034 | } |
4035 | |
4036 | SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op, |
4037 | SelectionDAG &DAG) const { |
4038 | SDLoc SL(Op); |
4039 | assert(Op.getValueType() == MVT::i32); |
4040 | |
4041 | uint32_t BothRoundHwReg = |
4042 | AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 0, Values: 4); |
4043 | SDValue GetRoundBothImm = DAG.getTargetConstant(Val: BothRoundHwReg, DL: SL, VT: MVT::i32); |
4044 | |
4045 | SDValue IntrinID = |
4046 | DAG.getTargetConstant(Val: Intrinsic::amdgcn_s_getreg, DL: SL, VT: MVT::i32); |
4047 | SDValue GetReg = DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL: SL, VTList: Op->getVTList(), |
4048 | N1: Op.getOperand(i: 0), N2: IntrinID, N3: GetRoundBothImm); |
4049 | |
4050 | // There are two rounding modes, one for f32 and one for f64/f16. We only |
4051 | // report in the standard value range if both are the same. |
4052 | // |
4053 | // The raw values also differ from the expected FLT_ROUNDS values. Nearest |
4054 | // ties away from zero is not supported, and the other values are rotated by |
4055 | // 1. |
4056 | // |
4057 | // If the two rounding modes are not the same, report a target defined value. |
4058 | |
4059 | // Mode register rounding mode fields: |
4060 | // |
4061 | // [1:0] Single-precision round mode. |
4062 | // [3:2] Double/Half-precision round mode. |
4063 | // |
  // 0 = nearest even; 1 = +infinity; 2 = -infinity; 3 = toward zero.
4065 | // |
4066 | // Hardware Spec |
4067 | // Toward-0 3 0 |
4068 | // Nearest Even 0 1 |
4069 | // +Inf 1 2 |
4070 | // -Inf 2 3 |
4071 | // NearestAway0 N/A 4 |
4072 | // |
  // We have to handle the 16 possible values of a 4-bit field, so we create a
  // 64-bit table we can index by the raw hardware mode.
4075 | // |
  // (trunc (FltRoundConversionTable >> (MODE.fp_round * 4))) & 0xf
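  //
  // As a worked example (reading off the mapping above rather than dumping
  // the table constant): if both fields are 0 (round to nearest even), the
  // nibble at index 0 should be 1, the standard FLT_ROUNDS "to nearest"
  // value, while a mixed setting such as f32 toward zero with f64/f16
  // nearest even maps to an entry >= 4 that is offset by 4 below into the
  // extended, target-specific range.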
4077 | |
4078 | SDValue BitTable = |
4079 | DAG.getConstant(Val: AMDGPU::FltRoundConversionTable, DL: SL, VT: MVT::i64); |
4080 | |
4081 | SDValue Two = DAG.getConstant(Val: 2, DL: SL, VT: MVT::i32); |
4082 | SDValue RoundModeTimesNumBits = |
4083 | DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: GetReg, N2: Two); |
4084 | |
4085 | // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we |
4086 | // knew only one mode was demanded. |
4087 | SDValue TableValue = |
4088 | DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: MVT::i64, N1: BitTable, N2: RoundModeTimesNumBits); |
4089 | SDValue TruncTable = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: TableValue); |
4090 | |
4091 | SDValue EntryMask = DAG.getConstant(Val: 0xf, DL: SL, VT: MVT::i32); |
4092 | SDValue TableEntry = |
4093 | DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32, N1: TruncTable, N2: EntryMask); |
4094 | |
4095 | // There's a gap in the 4-bit encoded table and actual enum values, so offset |
4096 | // if it's an extended value. |
4097 | SDValue Four = DAG.getConstant(Val: 4, DL: SL, VT: MVT::i32); |
4098 | SDValue IsStandardValue = |
4099 | DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: TableEntry, RHS: Four, Cond: ISD::SETULT); |
4100 | SDValue EnumOffset = DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32, N1: TableEntry, N2: Four); |
4101 | SDValue Result = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::i32, N1: IsStandardValue, |
4102 | N2: TableEntry, N3: EnumOffset); |
4103 | |
4104 | return DAG.getMergeValues(Ops: {Result, GetReg.getValue(R: 1)}, dl: SL); |
4105 | } |
4106 | |
4107 | SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op, |
4108 | SelectionDAG &DAG) const { |
4109 | SDLoc SL(Op); |
4110 | |
4111 | SDValue NewMode = Op.getOperand(i: 1); |
4112 | assert(NewMode.getValueType() == MVT::i32); |
4113 | |
4114 | // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the |
4115 | // hardware MODE.fp_round values. |
4116 | if (auto *ConstMode = dyn_cast<ConstantSDNode>(Val&: NewMode)) { |
4117 | uint32_t ClampedVal = std::min( |
4118 | a: static_cast<uint32_t>(ConstMode->getZExtValue()), |
4119 | b: static_cast<uint32_t>(AMDGPU::TowardZeroF32_TowardNegativeF64)); |
4120 | NewMode = DAG.getConstant( |
4121 | Val: AMDGPU::decodeFltRoundToHWConversionTable(FltRounds: ClampedVal), DL: SL, VT: MVT::i32); |
4122 | } else { |
4123 | // If we know the input can only be one of the supported standard modes in |
4124 | // the range 0-3, we can use a simplified mapping to hardware values. |
4125 | KnownBits KB = DAG.computeKnownBits(Op: NewMode); |
4126 | const bool UseReducedTable = KB.countMinLeadingZeros() >= 30; |
4127 | // The supported standard values are 0-3. The extended values start at 8. We |
4128 | // need to offset by 4 if the value is in the extended range. |
4129 | |
4130 | if (UseReducedTable) { |
      // Only the standard entries 0-3 are needed, so the low 16 bits of the
      // table suffice and fit in a 32-bit constant.
4132 | SDValue BitTable = DAG.getConstant( |
4133 | Val: AMDGPU::FltRoundToHWConversionTable & 0xffff, DL: SL, VT: MVT::i32); |
4134 | |
4135 | SDValue Two = DAG.getConstant(Val: 2, DL: SL, VT: MVT::i32); |
4136 | SDValue RoundModeTimesNumBits = |
4137 | DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: NewMode, N2: Two); |
4138 | |
4139 | NewMode = |
4140 | DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: MVT::i32, N1: BitTable, N2: RoundModeTimesNumBits); |
4141 | |
4142 | // TODO: SimplifyDemandedBits on the setreg source here can likely reduce |
4143 | // the table extracted bits into inline immediates. |
4144 | } else { |
4145 | // table_index = umin(value, value - 4) |
4146 | // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf |
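      //
      // A quick sanity check of the index computation (illustrative only): a
      // standard value such as 3 (toward zero) gives umin(3, 3 - 4) = 3
      // because the subtraction wraps to a large unsigned value, while an
      // extended value such as 8 gives umin(8, 4) = 4, so extended modes
      // occupy table indices 4 and up.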
4147 | SDValue BitTable = |
4148 | DAG.getConstant(Val: AMDGPU::FltRoundToHWConversionTable, DL: SL, VT: MVT::i64); |
4149 | |
4150 | SDValue Four = DAG.getConstant(Val: 4, DL: SL, VT: MVT::i32); |
4151 | SDValue OffsetEnum = DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i32, N1: NewMode, N2: Four); |
4152 | SDValue IndexVal = |
4153 | DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32, N1: NewMode, N2: OffsetEnum); |
4154 | |
4155 | SDValue Two = DAG.getConstant(Val: 2, DL: SL, VT: MVT::i32); |
4156 | SDValue RoundModeTimesNumBits = |
4157 | DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: IndexVal, N2: Two); |
4158 | |
4159 | SDValue TableValue = |
4160 | DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: MVT::i64, N1: BitTable, N2: RoundModeTimesNumBits); |
4161 | SDValue TruncTable = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: TableValue); |
4162 | |
4163 | // No need to mask out the high bits since the setreg will ignore them |
4164 | // anyway. |
4165 | NewMode = TruncTable; |
4166 | } |
4167 | |
4168 | // Insert a readfirstlane in case the value is a VGPR. We could do this |
4169 | // earlier and keep more operations scalar, but that interferes with |
4170 | // combining the source. |
4171 | SDValue ReadFirstLaneID = |
4172 | DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL: SL, VT: MVT::i32); |
4173 | NewMode = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32, |
4174 | N1: ReadFirstLaneID, N2: NewMode); |
4175 | } |
4176 | |
4177 | // N.B. The setreg will be later folded into s_round_mode on supported |
4178 | // targets. |
4179 | SDValue IntrinID = |
4180 | DAG.getTargetConstant(Val: Intrinsic::amdgcn_s_setreg, DL: SL, VT: MVT::i32); |
4181 | uint32_t BothRoundHwReg = |
4182 | AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 0, Values: 4); |
4183 | SDValue RoundBothImm = DAG.getTargetConstant(Val: BothRoundHwReg, DL: SL, VT: MVT::i32); |
4184 | |
4185 | SDValue SetReg = |
4186 | DAG.getNode(Opcode: ISD::INTRINSIC_VOID, DL: SL, VTList: Op->getVTList(), N1: Op.getOperand(i: 0), |
4187 | N2: IntrinID, N3: RoundBothImm, N4: NewMode); |
4188 | |
4189 | return SetReg; |
4190 | } |
4191 | |
4192 | SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const { |
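  // Keep only wave-uniform prefetches of flat, global, or constant addresses;
  // anything divergent or in another address space is handed back to generic
  // lowering by returning SDValue(). (Presumably the survivors are later
  // selected to scalar prefetch instructions on subtargets that provide them;
  // that selection is not shown here.)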
4193 | if (Op->isDivergent()) |
4194 | return SDValue(); |
4195 | |
4196 | switch (cast<MemSDNode>(Val&: Op)->getAddressSpace()) { |
4197 | case AMDGPUAS::FLAT_ADDRESS: |
4198 | case AMDGPUAS::GLOBAL_ADDRESS: |
4199 | case AMDGPUAS::CONSTANT_ADDRESS: |
4200 | case AMDGPUAS::CONSTANT_ADDRESS_32BIT: |
4201 | break; |
4202 | default: |
4203 | return SDValue(); |
4204 | } |
4205 | |
4206 | return Op; |
4207 | } |
4208 | |
4209 | // Work around DAG legality rules only based on the result type. |
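// A note on the bf16 case handled below (a sketch of the intent, not a
// normative statement): a bf16 value has the same layout as the high 16 bits
// of an f32, so the extend is expressed as a bitcast to the integer type
// followed by ISD::BF16_TO_FP rather than a plain FP_EXTEND on the bf16
// source.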
4210 | SDValue SITargetLowering::lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { |
4211 | bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND; |
4212 | SDValue Src = Op.getOperand(i: IsStrict ? 1 : 0); |
4213 | EVT SrcVT = Src.getValueType(); |
4214 | |
4215 | if (SrcVT.getScalarType() != MVT::bf16) |
4216 | return Op; |
4217 | |
4218 | SDLoc SL(Op); |
4219 | SDValue BitCast = |
4220 | DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: SrcVT.changeTypeToInteger(), Operand: Src); |
4221 | |
4222 | EVT DstVT = Op.getValueType(); |
4223 | if (IsStrict) |
4224 | llvm_unreachable("Need STRICT_BF16_TO_FP" ); |
4225 | |
4226 | return DAG.getNode(Opcode: ISD::BF16_TO_FP, DL: SL, VT: DstVT, Operand: BitCast); |
4227 | } |
4228 | |
4229 | SDValue SITargetLowering::lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const { |
4230 | SDLoc SL(Op); |
4231 | if (Op.getValueType() != MVT::i64) |
4232 | return Op; |
4233 | |
4234 | uint32_t ModeHwReg = |
4235 | AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 0, Values: 23); |
4236 | SDValue ModeHwRegImm = DAG.getTargetConstant(Val: ModeHwReg, DL: SL, VT: MVT::i32); |
4237 | uint32_t TrapHwReg = |
4238 | AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_TRAPSTS, Values: 0, Values: 5); |
4239 | SDValue TrapHwRegImm = DAG.getTargetConstant(Val: TrapHwReg, DL: SL, VT: MVT::i32); |
4240 | |
4241 | SDVTList VTList = DAG.getVTList(VT1: MVT::i32, VT2: MVT::Other); |
4242 | SDValue IntrinID = |
4243 | DAG.getTargetConstant(Val: Intrinsic::amdgcn_s_getreg, DL: SL, VT: MVT::i32); |
4244 | SDValue GetModeReg = DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL: SL, VTList, |
4245 | N1: Op.getOperand(i: 0), N2: IntrinID, N3: ModeHwRegImm); |
4246 | SDValue GetTrapReg = DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL: SL, VTList, |
4247 | N1: Op.getOperand(i: 0), N2: IntrinID, N3: TrapHwRegImm); |
4248 | SDValue TokenReg = |
4249 | DAG.getNode(Opcode: ISD::TokenFactor, DL: SL, VT: MVT::Other, N1: GetModeReg.getValue(R: 1), |
4250 | N2: GetTrapReg.getValue(R: 1)); |
4251 | |
4252 | SDValue CvtPtr = |
4253 | DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32, N1: GetModeReg, N2: GetTrapReg); |
4254 | SDValue Result = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: CvtPtr); |
4255 | |
4256 | return DAG.getMergeValues(Ops: {Result, TokenReg}, dl: SL); |
4257 | } |
4258 | |
4259 | SDValue SITargetLowering::lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const { |
4260 | SDLoc SL(Op); |
4261 | if (Op.getOperand(i: 1).getValueType() != MVT::i64) |
4262 | return Op; |
4263 | |
4264 | SDValue Input = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Op.getOperand(i: 1)); |
4265 | SDValue NewModeReg = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Input, |
4266 | N2: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32)); |
4267 | SDValue NewTrapReg = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Input, |
4268 | N2: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32)); |
4269 | |
4270 | SDValue ReadFirstLaneID = |
4271 | DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL: SL, VT: MVT::i32); |
4272 | NewModeReg = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32, |
4273 | N1: ReadFirstLaneID, N2: NewModeReg); |
4274 | NewTrapReg = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32, |
4275 | N1: ReadFirstLaneID, N2: NewTrapReg); |
4276 | |
4277 | unsigned ModeHwReg = |
4278 | AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 0, Values: 23); |
4279 | SDValue ModeHwRegImm = DAG.getTargetConstant(Val: ModeHwReg, DL: SL, VT: MVT::i32); |
4280 | unsigned TrapHwReg = |
4281 | AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_TRAPSTS, Values: 0, Values: 5); |
4282 | SDValue TrapHwRegImm = DAG.getTargetConstant(Val: TrapHwReg, DL: SL, VT: MVT::i32); |
4283 | |
4284 | SDValue IntrinID = |
4285 | DAG.getTargetConstant(Val: Intrinsic::amdgcn_s_setreg, DL: SL, VT: MVT::i32); |
4286 | SDValue SetModeReg = |
4287 | DAG.getNode(Opcode: ISD::INTRINSIC_VOID, DL: SL, VT: MVT::Other, N1: Op.getOperand(i: 0), |
4288 | N2: IntrinID, N3: ModeHwRegImm, N4: NewModeReg); |
4289 | SDValue SetTrapReg = |
4290 | DAG.getNode(Opcode: ISD::INTRINSIC_VOID, DL: SL, VT: MVT::Other, N1: Op.getOperand(i: 0), |
4291 | N2: IntrinID, N3: TrapHwRegImm, N4: NewTrapReg); |
4292 | return DAG.getNode(Opcode: ISD::TokenFactor, DL: SL, VT: MVT::Other, N1: SetTrapReg, N2: SetModeReg); |
4293 | } |
4294 | |
4295 | Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT, |
4296 | const MachineFunction &MF) const { |
4297 | Register Reg = StringSwitch<Register>(RegName) |
4298 | .Case(S: "m0" , Value: AMDGPU::M0) |
4299 | .Case(S: "exec" , Value: AMDGPU::EXEC) |
4300 | .Case(S: "exec_lo" , Value: AMDGPU::EXEC_LO) |
4301 | .Case(S: "exec_hi" , Value: AMDGPU::EXEC_HI) |
4302 | .Case(S: "flat_scratch" , Value: AMDGPU::FLAT_SCR) |
4303 | .Case(S: "flat_scratch_lo" , Value: AMDGPU::FLAT_SCR_LO) |
4304 | .Case(S: "flat_scratch_hi" , Value: AMDGPU::FLAT_SCR_HI) |
4305 | .Default(Value: Register()); |
4306 | |
4307 | if (Reg == AMDGPU::NoRegister) { |
4308 | report_fatal_error(reason: Twine("invalid register name \"" |
4309 | + StringRef(RegName) + "\"." )); |
4311 | } |
4312 | |
4313 | if (!Subtarget->hasFlatScrRegister() && |
4314 | Subtarget->getRegisterInfo()->regsOverlap(RegA: Reg, RegB: AMDGPU::FLAT_SCR)) { |
4315 | report_fatal_error(reason: Twine("invalid register \"" |
4316 | + StringRef(RegName) + "\" for subtarget." )); |
4317 | } |
4318 | |
4319 | switch (Reg) { |
4320 | case AMDGPU::M0: |
4321 | case AMDGPU::EXEC_LO: |
4322 | case AMDGPU::EXEC_HI: |
4323 | case AMDGPU::FLAT_SCR_LO: |
4324 | case AMDGPU::FLAT_SCR_HI: |
4325 | if (VT.getSizeInBits() == 32) |
4326 | return Reg; |
4327 | break; |
4328 | case AMDGPU::EXEC: |
4329 | case AMDGPU::FLAT_SCR: |
4330 | if (VT.getSizeInBits() == 64) |
4331 | return Reg; |
4332 | break; |
4333 | default: |
4334 | llvm_unreachable("missing register type checking" ); |
4335 | } |
4336 | |
4337 | report_fatal_error(reason: Twine("invalid type for register \"" |
4338 | + StringRef(RegName) + "\"." )); |
4339 | } |
4340 | |
4341 | // If kill is not the last instruction, split the block so kill is always a |
4342 | // proper terminator. |
4343 | MachineBasicBlock * |
4344 | SITargetLowering::splitKillBlock(MachineInstr &MI, |
4345 | MachineBasicBlock *BB) const { |
4346 | MachineBasicBlock *SplitBB = BB->splitAt(SplitInst&: MI, UpdateLiveIns: false /*UpdateLiveIns*/); |
4347 | const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); |
4348 | MI.setDesc(TII->getKillTerminatorFromPseudo(Opcode: MI.getOpcode())); |
4349 | return SplitBB; |
4350 | } |
4351 | |
// Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is
// true, \p MI will be the only instruction in the loop body block. Otherwise,
// it will be the first instruction in the remainder block.
4355 | // |
4356 | /// \returns { LoopBody, Remainder } |
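///
/// A rough sketch of the resulting CFG (illustrative; the self edge on the
/// loop body is only the successor bookkeeping added here, the branch back is
/// inserted by the caller):
///
///     MBB
///      |
///      v
///   LoopBody <-+
///      |  \____|
///      v
///   Remainder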
4357 | static std::pair<MachineBasicBlock *, MachineBasicBlock *> |
4358 | splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) { |
4359 | MachineFunction *MF = MBB.getParent(); |
4360 | MachineBasicBlock::iterator I(&MI); |
4361 | |
4362 | // To insert the loop we need to split the block. Move everything after this |
4363 | // point to a new block, and insert a new empty block between the two. |
4364 | MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock(); |
4365 | MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock(); |
4366 | MachineFunction::iterator MBBI(MBB); |
4367 | ++MBBI; |
4368 | |
4369 | MF->insert(MBBI, MBB: LoopBB); |
4370 | MF->insert(MBBI, MBB: RemainderBB); |
4371 | |
4372 | LoopBB->addSuccessor(Succ: LoopBB); |
4373 | LoopBB->addSuccessor(Succ: RemainderBB); |
4374 | |
4375 | // Move the rest of the block into a new block. |
4376 | RemainderBB->transferSuccessorsAndUpdatePHIs(FromMBB: &MBB); |
4377 | |
4378 | if (InstInLoop) { |
4379 | auto Next = std::next(x: I); |
4380 | |
4381 | // Move instruction to loop body. |
4382 | LoopBB->splice(Where: LoopBB->begin(), Other: &MBB, From: I, To: Next); |
4383 | |
4384 | // Move the rest of the block. |
4385 | RemainderBB->splice(Where: RemainderBB->begin(), Other: &MBB, From: Next, To: MBB.end()); |
4386 | } else { |
4387 | RemainderBB->splice(Where: RemainderBB->begin(), Other: &MBB, From: I, To: MBB.end()); |
4388 | } |
4389 | |
4390 | MBB.addSuccessor(Succ: LoopBB); |
4391 | |
4392 | return std::pair(LoopBB, RemainderBB); |
4393 | } |
4394 | |
4395 | /// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it. |
4396 | void SITargetLowering::bundleInstWithWaitcnt(MachineInstr &MI) const { |
4397 | MachineBasicBlock *MBB = MI.getParent(); |
4398 | const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); |
4399 | auto I = MI.getIterator(); |
4400 | auto E = std::next(x: I); |
4401 | |
4402 | BuildMI(BB&: *MBB, I: E, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_WAITCNT)) |
4403 | .addImm(Val: 0); |
4404 | |
4405 | MIBundleBuilder Bundler(*MBB, I, E); |
4406 | finalizeBundle(MBB&: *MBB, FirstMI: Bundler.begin()); |
4407 | } |
4408 | |
4409 | MachineBasicBlock * |
4410 | SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI, |
4411 | MachineBasicBlock *BB) const { |
4412 | const DebugLoc &DL = MI.getDebugLoc(); |
4413 | |
4414 | MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); |
4415 | |
4416 | MachineBasicBlock *LoopBB; |
4417 | MachineBasicBlock *RemainderBB; |
4418 | const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); |
4419 | |
4420 | // Apparently kill flags are only valid if the def is in the same block? |
4421 | if (MachineOperand *Src = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::data0)) |
4422 | Src->setIsKill(false); |
4423 | |
4424 | std::tie(args&: LoopBB, args&: RemainderBB) = splitBlockForLoop(MI, MBB&: *BB, InstInLoop: true); |
4425 | |
4426 | MachineBasicBlock::iterator I = LoopBB->end(); |
4427 | |
4428 | const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode( |
4429 | Values: AMDGPU::Hwreg::ID_TRAPSTS, Values: AMDGPU::Hwreg::OFFSET_MEM_VIOL, Values: 1); |
4430 | |
4431 | // Clear TRAP_STS.MEM_VIOL |
4432 | BuildMI(BB&: *LoopBB, I: LoopBB->begin(), MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_SETREG_IMM32_B32)) |
4433 | .addImm(Val: 0) |
4434 | .addImm(Val: EncodedReg); |
4435 | |
4436 | bundleInstWithWaitcnt(MI); |
4437 | |
4438 | Register Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass); |
4439 | |
4440 | // Load and check TRAP_STS.MEM_VIOL |
4441 | BuildMI(BB&: *LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_GETREG_B32), DestReg: Reg) |
4442 | .addImm(Val: EncodedReg); |
4443 | |
4444 | // FIXME: Do we need to use an isel pseudo that may clobber scc? |
4445 | BuildMI(BB&: *LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_LG_U32)) |
4446 | .addReg(RegNo: Reg, flags: RegState::Kill) |
4447 | .addImm(Val: 0); |
4448 | BuildMI(BB&: *LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CBRANCH_SCC1)) |
4449 | .addMBB(MBB: LoopBB); |
4450 | |
4451 | return RemainderBB; |
4452 | } |
4453 | |
4454 | // Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the |
4455 | // wavefront. If the value is uniform and just happens to be in a VGPR, this |
4456 | // will only do one iteration. In the worst case, this will loop 64 times. |
4457 | // |
4458 | // TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value. |
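//
// Roughly, the emitted waterfall loop looks like this (a wave64-flavored
// sketch, not the exact instruction sequence or operand order):
//
//   loop:
//     v_readfirstlane_b32 s_idx, v_idx          ; pick one lane's index
//     v_cmp_eq_u32        s_cond, s_idx, v_idx  ; lanes sharing that index
//     s_and_saveexec_b64  s_saved, s_cond       ; run just those lanes
//     ; ... indexed access using s_idx via M0 or GPR index mode ...
//     s_xor_b64           exec, exec, s_saved   ; drop the handled lanes
//     s_cbranch_execnz    loop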
4459 | static MachineBasicBlock::iterator |
4460 | emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, |
4461 | MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, |
4462 | const DebugLoc &DL, const MachineOperand &Idx, |
4463 | unsigned InitReg, unsigned ResultReg, unsigned PhiReg, |
4464 | unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, |
4465 | Register &SGPRIdxReg) { |
4466 | |
4467 | MachineFunction *MF = OrigBB.getParent(); |
4468 | const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); |
4469 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
4470 | MachineBasicBlock::iterator I = LoopBB.begin(); |
4471 | |
4472 | const TargetRegisterClass *BoolRC = TRI->getBoolRC(); |
4473 | Register PhiExec = MRI.createVirtualRegister(RegClass: BoolRC); |
4474 | Register NewExec = MRI.createVirtualRegister(RegClass: BoolRC); |
4475 | Register CurrentIdxReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SGPR_32RegClass); |
4476 | Register CondReg = MRI.createVirtualRegister(RegClass: BoolRC); |
4477 | |
4478 | BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::PHI), DestReg: PhiReg) |
4479 | .addReg(RegNo: InitReg) |
4480 | .addMBB(MBB: &OrigBB) |
4481 | .addReg(RegNo: ResultReg) |
4482 | .addMBB(MBB: &LoopBB); |
4483 | |
4484 | BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::PHI), DestReg: PhiExec) |
4485 | .addReg(RegNo: InitSaveExecReg) |
4486 | .addMBB(MBB: &OrigBB) |
4487 | .addReg(RegNo: NewExec) |
4488 | .addMBB(MBB: &LoopBB); |
4489 | |
4490 | // Read the next variant <- also loop target. |
4491 | BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: CurrentIdxReg) |
4492 | .addReg(RegNo: Idx.getReg(), flags: getUndefRegState(B: Idx.isUndef())); |
4493 | |
4494 | // Compare the just read M0 value to all possible Idx values. |
4495 | BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_CMP_EQ_U32_e64), DestReg: CondReg) |
4496 | .addReg(RegNo: CurrentIdxReg) |
4497 | .addReg(RegNo: Idx.getReg(), flags: 0, SubReg: Idx.getSubReg()); |
4498 | |
4499 | // Update EXEC, save the original EXEC value to VCC. |
4500 | BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 |
4501 | : AMDGPU::S_AND_SAVEEXEC_B64), |
4502 | DestReg: NewExec) |
4503 | .addReg(RegNo: CondReg, flags: RegState::Kill); |
4504 | |
4505 | MRI.setSimpleHint(VReg: NewExec, PrefReg: CondReg); |
4506 | |
4507 | if (UseGPRIdxMode) { |
4508 | if (Offset == 0) { |
4509 | SGPRIdxReg = CurrentIdxReg; |
4510 | } else { |
4511 | SGPRIdxReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SGPR_32RegClass); |
4512 | BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: SGPRIdxReg) |
4513 | .addReg(RegNo: CurrentIdxReg, flags: RegState::Kill) |
4514 | .addImm(Val: Offset); |
4515 | } |
4516 | } else { |
4517 | // Move index from VCC into M0 |
4518 | if (Offset == 0) { |
4519 | BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: AMDGPU::M0) |
4520 | .addReg(RegNo: CurrentIdxReg, flags: RegState::Kill); |
4521 | } else { |
4522 | BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: AMDGPU::M0) |
4523 | .addReg(RegNo: CurrentIdxReg, flags: RegState::Kill) |
4524 | .addImm(Val: Offset); |
4525 | } |
4526 | } |
4527 | |
4528 | // Update EXEC, switch all done bits to 0 and all todo bits to 1. |
4529 | unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; |
4530 | MachineInstr *InsertPt = |
4531 | BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: ST.isWave32() ? AMDGPU::S_XOR_B32_term |
4532 | : AMDGPU::S_XOR_B64_term), DestReg: Exec) |
4533 | .addReg(RegNo: Exec) |
4534 | .addReg(RegNo: NewExec); |
4535 | |
4536 | // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use |
4537 | // s_cbranch_scc0? |
4538 | |
4539 | // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover. |
4540 | BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CBRANCH_EXECNZ)) |
4541 | .addMBB(MBB: &LoopBB); |
4542 | |
4543 | return InsertPt->getIterator(); |
4544 | } |
4545 | |
// This has slightly sub-optimal regalloc when the source vector is killed by
// the read. The register allocator does not understand that the kill is
// per-workitem, so the source is kept live for the whole loop and we end up
// not reusing a subregister from it, using 1 more VGPR than necessary. That
// extra VGPR was avoided when this was expanded after register allocation.
4551 | static MachineBasicBlock::iterator |
4552 | loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, |
4553 | unsigned InitResultReg, unsigned PhiReg, int Offset, |
4554 | bool UseGPRIdxMode, Register &SGPRIdxReg) { |
4555 | MachineFunction *MF = MBB.getParent(); |
4556 | const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); |
4557 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
4558 | MachineRegisterInfo &MRI = MF->getRegInfo(); |
4559 | const DebugLoc &DL = MI.getDebugLoc(); |
4560 | MachineBasicBlock::iterator I(&MI); |
4561 | |
4562 | const auto *BoolXExecRC = TRI->getRegClass(RCID: AMDGPU::SReg_1_XEXECRegClassID); |
4563 | Register DstReg = MI.getOperand(i: 0).getReg(); |
4564 | Register SaveExec = MRI.createVirtualRegister(RegClass: BoolXExecRC); |
4565 | Register TmpExec = MRI.createVirtualRegister(RegClass: BoolXExecRC); |
4566 | unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; |
4567 | unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; |
4568 | |
4569 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::IMPLICIT_DEF), DestReg: TmpExec); |
4570 | |
4571 | // Save the EXEC mask |
4572 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: MovExecOpc), DestReg: SaveExec) |
4573 | .addReg(RegNo: Exec); |
4574 | |
4575 | MachineBasicBlock *LoopBB; |
4576 | MachineBasicBlock *RemainderBB; |
4577 | std::tie(args&: LoopBB, args&: RemainderBB) = splitBlockForLoop(MI, MBB, InstInLoop: false); |
4578 | |
4579 | const MachineOperand *Idx = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::idx); |
4580 | |
4581 | auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, OrigBB&: MBB, LoopBB&: *LoopBB, DL, Idx: *Idx, |
4582 | InitReg: InitResultReg, ResultReg: DstReg, PhiReg, InitSaveExecReg: TmpExec, |
4583 | Offset, UseGPRIdxMode, SGPRIdxReg); |
4584 | |
4585 | MachineBasicBlock* LandingPad = MF->CreateMachineBasicBlock(); |
4586 | MachineFunction::iterator MBBI(LoopBB); |
4587 | ++MBBI; |
4588 | MF->insert(MBBI, MBB: LandingPad); |
4589 | LoopBB->removeSuccessor(Succ: RemainderBB); |
4590 | LandingPad->addSuccessor(Succ: RemainderBB); |
4591 | LoopBB->addSuccessor(Succ: LandingPad); |
4592 | MachineBasicBlock::iterator First = LandingPad->begin(); |
4593 | BuildMI(BB&: *LandingPad, I: First, MIMD: DL, MCID: TII->get(Opcode: MovExecOpc), DestReg: Exec) |
4594 | .addReg(RegNo: SaveExec); |
4595 | |
4596 | return InsPt; |
4597 | } |
4598 | |
4599 | // Returns subreg index, offset |
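// For example (illustrative): with a 128-bit super-register class, a constant
// offset of 2 becomes (sub2, 0), while an out-of-range offset such as 7 is
// passed through unchanged as (sub0, 7).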
4600 | static std::pair<unsigned, int> |
4601 | computeIndirectRegAndOffset(const SIRegisterInfo &TRI, |
4602 | const TargetRegisterClass *SuperRC, |
4603 | unsigned VecReg, |
4604 | int Offset) { |
4605 | int NumElts = TRI.getRegSizeInBits(RC: *SuperRC) / 32; |
4606 | |
4607 | // Skip out of bounds offsets, or else we would end up using an undefined |
4608 | // register. |
4609 | if (Offset >= NumElts || Offset < 0) |
4610 | return std::pair(AMDGPU::sub0, Offset); |
4611 | |
4612 | return std::pair(SIRegisterInfo::getSubRegFromChannel(Channel: Offset), 0); |
4613 | } |
4614 | |
4615 | static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, |
4616 | MachineRegisterInfo &MRI, MachineInstr &MI, |
4617 | int Offset) { |
4618 | MachineBasicBlock *MBB = MI.getParent(); |
4619 | const DebugLoc &DL = MI.getDebugLoc(); |
4620 | MachineBasicBlock::iterator I(&MI); |
4621 | |
4622 | const MachineOperand *Idx = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::idx); |
4623 | |
4624 | assert(Idx->getReg() != AMDGPU::NoRegister); |
4625 | |
4626 | if (Offset == 0) { |
4627 | BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: AMDGPU::M0).add(MO: *Idx); |
4628 | } else { |
4629 | BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: AMDGPU::M0) |
4630 | .add(MO: *Idx) |
4631 | .addImm(Val: Offset); |
4632 | } |
4633 | } |
4634 | |
4635 | static Register getIndirectSGPRIdx(const SIInstrInfo *TII, |
4636 | MachineRegisterInfo &MRI, MachineInstr &MI, |
4637 | int Offset) { |
4638 | MachineBasicBlock *MBB = MI.getParent(); |
4639 | const DebugLoc &DL = MI.getDebugLoc(); |
4640 | MachineBasicBlock::iterator I(&MI); |
4641 | |
4642 | const MachineOperand *Idx = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::idx); |
4643 | |
4644 | if (Offset == 0) |
4645 | return Idx->getReg(); |
4646 | |
4647 | Register Tmp = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass); |
4648 | BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: Tmp) |
4649 | .add(MO: *Idx) |
4650 | .addImm(Val: Offset); |
4651 | return Tmp; |
4652 | } |
4653 | |
4654 | static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, |
4655 | MachineBasicBlock &MBB, |
4656 | const GCNSubtarget &ST) { |
4657 | const SIInstrInfo *TII = ST.getInstrInfo(); |
4658 | const SIRegisterInfo &TRI = TII->getRegisterInfo(); |
4659 | MachineFunction *MF = MBB.getParent(); |
4660 | MachineRegisterInfo &MRI = MF->getRegInfo(); |
4661 | |
4662 | Register Dst = MI.getOperand(i: 0).getReg(); |
4663 | const MachineOperand *Idx = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::idx); |
4664 | Register SrcReg = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src)->getReg(); |
4665 | int Offset = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::offset)->getImm(); |
4666 | |
4667 | const TargetRegisterClass *VecRC = MRI.getRegClass(Reg: SrcReg); |
4668 | const TargetRegisterClass *IdxRC = MRI.getRegClass(Reg: Idx->getReg()); |
4669 | |
4670 | unsigned SubReg; |
4671 | std::tie(args&: SubReg, args&: Offset) |
4672 | = computeIndirectRegAndOffset(TRI, SuperRC: VecRC, VecReg: SrcReg, Offset); |
4673 | |
4674 | const bool UseGPRIdxMode = ST.useVGPRIndexMode(); |
4675 | |
4676 | // Check for a SGPR index. |
4677 | if (TII->getRegisterInfo().isSGPRClass(RC: IdxRC)) { |
4678 | MachineBasicBlock::iterator I(&MI); |
4679 | const DebugLoc &DL = MI.getDebugLoc(); |
4680 | |
4681 | if (UseGPRIdxMode) { |
4682 | // TODO: Look at the uses to avoid the copy. This may require rescheduling |
4683 | // to avoid interfering with other uses, so probably requires a new |
4684 | // optimization pass. |
4685 | Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset); |
4686 | |
4687 | const MCInstrDesc &GPRIDXDesc = |
4688 | TII->getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(RC: *VecRC), IsIndirectSrc: true); |
4689 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: GPRIDXDesc, DestReg: Dst) |
4690 | .addReg(RegNo: SrcReg) |
4691 | .addReg(RegNo: Idx) |
4692 | .addImm(Val: SubReg); |
4693 | } else { |
4694 | setM0ToIndexFromSGPR(TII, MRI, MI, Offset); |
4695 | |
4696 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MOVRELS_B32_e32), DestReg: Dst) |
4697 | .addReg(RegNo: SrcReg, flags: 0, SubReg) |
4698 | .addReg(RegNo: SrcReg, flags: RegState::Implicit); |
4699 | } |
4700 | |
4701 | MI.eraseFromParent(); |
4702 | |
4703 | return &MBB; |
4704 | } |
4705 | |
4706 | // Control flow needs to be inserted if indexing with a VGPR. |
4707 | const DebugLoc &DL = MI.getDebugLoc(); |
4708 | MachineBasicBlock::iterator I(&MI); |
4709 | |
4710 | Register PhiReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass); |
4711 | Register InitReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass); |
4712 | |
4713 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::IMPLICIT_DEF), DestReg: InitReg); |
4714 | |
4715 | Register SGPRIdxReg; |
4716 | auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitResultReg: InitReg, PhiReg, Offset, |
4717 | UseGPRIdxMode, SGPRIdxReg); |
4718 | |
4719 | MachineBasicBlock *LoopBB = InsPt->getParent(); |
4720 | |
4721 | if (UseGPRIdxMode) { |
4722 | const MCInstrDesc &GPRIDXDesc = |
4723 | TII->getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(RC: *VecRC), IsIndirectSrc: true); |
4724 | |
4725 | BuildMI(BB&: *LoopBB, I: InsPt, MIMD: DL, MCID: GPRIDXDesc, DestReg: Dst) |
4726 | .addReg(RegNo: SrcReg) |
4727 | .addReg(RegNo: SGPRIdxReg) |
4728 | .addImm(Val: SubReg); |
4729 | } else { |
4730 | BuildMI(BB&: *LoopBB, I: InsPt, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MOVRELS_B32_e32), DestReg: Dst) |
4731 | .addReg(RegNo: SrcReg, flags: 0, SubReg) |
4732 | .addReg(RegNo: SrcReg, flags: RegState::Implicit); |
4733 | } |
4734 | |
4735 | MI.eraseFromParent(); |
4736 | |
4737 | return LoopBB; |
4738 | } |
4739 | |
4740 | static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, |
4741 | MachineBasicBlock &MBB, |
4742 | const GCNSubtarget &ST) { |
4743 | const SIInstrInfo *TII = ST.getInstrInfo(); |
4744 | const SIRegisterInfo &TRI = TII->getRegisterInfo(); |
4745 | MachineFunction *MF = MBB.getParent(); |
4746 | MachineRegisterInfo &MRI = MF->getRegInfo(); |
4747 | |
4748 | Register Dst = MI.getOperand(i: 0).getReg(); |
4749 | const MachineOperand *SrcVec = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src); |
4750 | const MachineOperand *Idx = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::idx); |
4751 | const MachineOperand *Val = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::val); |
4752 | int Offset = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::offset)->getImm(); |
4753 | const TargetRegisterClass *VecRC = MRI.getRegClass(Reg: SrcVec->getReg()); |
4754 | const TargetRegisterClass *IdxRC = MRI.getRegClass(Reg: Idx->getReg()); |
4755 | |
4756 | // This can be an immediate, but will be folded later. |
4757 | assert(Val->getReg()); |
4758 | |
4759 | unsigned SubReg; |
4760 | std::tie(args&: SubReg, args&: Offset) = computeIndirectRegAndOffset(TRI, SuperRC: VecRC, |
4761 | VecReg: SrcVec->getReg(), |
4762 | Offset); |
4763 | const bool UseGPRIdxMode = ST.useVGPRIndexMode(); |
4764 | |
4765 | if (Idx->getReg() == AMDGPU::NoRegister) { |
4766 | MachineBasicBlock::iterator I(&MI); |
4767 | const DebugLoc &DL = MI.getDebugLoc(); |
4768 | |
4769 | assert(Offset == 0); |
4770 | |
4771 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::INSERT_SUBREG), DestReg: Dst) |
4772 | .add(MO: *SrcVec) |
4773 | .add(MO: *Val) |
4774 | .addImm(Val: SubReg); |
4775 | |
4776 | MI.eraseFromParent(); |
4777 | return &MBB; |
4778 | } |
4779 | |
4780 | // Check for a SGPR index. |
4781 | if (TII->getRegisterInfo().isSGPRClass(RC: IdxRC)) { |
4782 | MachineBasicBlock::iterator I(&MI); |
4783 | const DebugLoc &DL = MI.getDebugLoc(); |
4784 | |
4785 | if (UseGPRIdxMode) { |
4786 | Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset); |
4787 | |
4788 | const MCInstrDesc &GPRIDXDesc = |
4789 | TII->getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(RC: *VecRC), IsIndirectSrc: false); |
4790 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: GPRIDXDesc, DestReg: Dst) |
4791 | .addReg(RegNo: SrcVec->getReg()) |
4792 | .add(MO: *Val) |
4793 | .addReg(RegNo: Idx) |
4794 | .addImm(Val: SubReg); |
4795 | } else { |
4796 | setM0ToIndexFromSGPR(TII, MRI, MI, Offset); |
4797 | |
4798 | const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo( |
4799 | VecSize: TRI.getRegSizeInBits(RC: *VecRC), EltSize: 32, IsSGPR: false); |
4800 | BuildMI(BB&: MBB, I, MIMD: DL, MCID: MovRelDesc, DestReg: Dst) |
4801 | .addReg(RegNo: SrcVec->getReg()) |
4802 | .add(MO: *Val) |
4803 | .addImm(Val: SubReg); |
4804 | } |
4805 | MI.eraseFromParent(); |
4806 | return &MBB; |
4807 | } |
4808 | |
4809 | // Control flow needs to be inserted if indexing with a VGPR. |
4810 | if (Val->isReg()) |
4811 | MRI.clearKillFlags(Reg: Val->getReg()); |
4812 | |
4813 | const DebugLoc &DL = MI.getDebugLoc(); |
4814 | |
4815 | Register PhiReg = MRI.createVirtualRegister(RegClass: VecRC); |
4816 | |
4817 | Register SGPRIdxReg; |
4818 | auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitResultReg: SrcVec->getReg(), PhiReg, Offset, |
4819 | UseGPRIdxMode, SGPRIdxReg); |
4820 | MachineBasicBlock *LoopBB = InsPt->getParent(); |
4821 | |
4822 | if (UseGPRIdxMode) { |
4823 | const MCInstrDesc &GPRIDXDesc = |
4824 | TII->getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(RC: *VecRC), IsIndirectSrc: false); |
4825 | |
4826 | BuildMI(BB&: *LoopBB, I: InsPt, MIMD: DL, MCID: GPRIDXDesc, DestReg: Dst) |
4827 | .addReg(RegNo: PhiReg) |
4828 | .add(MO: *Val) |
4829 | .addReg(RegNo: SGPRIdxReg) |
4830 | .addImm(Val: SubReg); |
4831 | } else { |
4832 | const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo( |
4833 | VecSize: TRI.getRegSizeInBits(RC: *VecRC), EltSize: 32, IsSGPR: false); |
4834 | BuildMI(BB&: *LoopBB, I: InsPt, MIMD: DL, MCID: MovRelDesc, DestReg: Dst) |
4835 | .addReg(RegNo: PhiReg) |
4836 | .add(MO: *Val) |
4837 | .addImm(Val: SubReg); |
4838 | } |
4839 | |
4840 | MI.eraseFromParent(); |
4841 | return LoopBB; |
4842 | } |
4843 | |
4844 | static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, |
4845 | MachineBasicBlock &BB, |
4846 | const GCNSubtarget &ST, |
4847 | unsigned Opc) { |
4848 | MachineRegisterInfo &MRI = BB.getParent()->getRegInfo(); |
4849 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
4850 | const DebugLoc &DL = MI.getDebugLoc(); |
4851 | const SIInstrInfo *TII = ST.getInstrInfo(); |
4852 | |
4853 | // Reduction operations depend on whether the input operand is SGPR or VGPR. |
4854 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
4855 | bool isSGPR = TRI->isSGPRClass(RC: MRI.getRegClass(Reg: SrcReg)); |
4856 | Register DstReg = MI.getOperand(i: 0).getReg(); |
4857 | MachineBasicBlock *RetBB = nullptr; |
4858 | if (isSGPR) { |
    // These operations on a uniform value (i.e. an SGPR) are idempotent:
    // the reduced value is the same as the given SGPR.
4861 | BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: DstReg).addReg(RegNo: SrcReg); |
4862 | RetBB = &BB; |
4863 | } else { |
    // TODO: Implement the DPP strategy and switch based on the immediate
    // strategy operand. For now we use the iterative approach for all cases
    // (default, Iterative, and DPP).
4867 | |
    // To reduce the VGPR using the iterative approach, we need to iterate
    // over all the active lanes. The lowering consists of a ComputeLoop that
    // iterates over only the active lanes. A copy of the EXEC register is
    // used as the induction variable; each iteration clears the bit of the
    // lane it just processed with bitset0 so the next iteration finds the
    // next active lane.
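    //
    // A sketch of the loop that is built below (not the exact MIR):
    //
    //   active = copy of EXEC; acc = identity (0 or UINT_MAX)
    //   ComputeLoop:
    //     lane   = s_ff1(active)            ; lowest set bit = next lane
    //     val    = v_readlane_b32(src, lane)
    //     acc    = op(acc, val)             ; e.g. s_min_u32 / s_max_u32
    //     active = s_bitset0(active, lane)
    //   repeat while active != 0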
4873 | MachineBasicBlock::iterator I = BB.end(); |
4874 | Register SrcReg = MI.getOperand(i: 1).getReg(); |
4875 | |
    // Create the control flow for the loop:
    // split MI's machine basic block into the loop body and the remainder.
4878 | auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, MBB&: BB, InstInLoop: true); |
4879 | |
4880 | // Create virtual registers required for lowering. |
4881 | const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass(); |
4882 | const TargetRegisterClass *DstRegClass = MRI.getRegClass(Reg: DstReg); |
4883 | Register LoopIterator = MRI.createVirtualRegister(RegClass: WaveMaskRegClass); |
4884 | Register InitalValReg = MRI.createVirtualRegister(RegClass: DstRegClass); |
4885 | |
4886 | Register AccumulatorReg = MRI.createVirtualRegister(RegClass: DstRegClass); |
4887 | Register ActiveBitsReg = MRI.createVirtualRegister(RegClass: WaveMaskRegClass); |
4888 | Register NewActiveBitsReg = MRI.createVirtualRegister(RegClass: WaveMaskRegClass); |
4889 | |
4890 | Register FF1Reg = MRI.createVirtualRegister(RegClass: DstRegClass); |
4891 | Register LaneValueReg = MRI.createVirtualRegister(RegClass: DstRegClass); |
4892 | |
4893 | bool IsWave32 = ST.isWave32(); |
4894 | unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; |
4895 | unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; |
4896 | |
    // Create the initial values of the induction variable (from EXEC) and the
    // accumulator, and insert a branch to the newly created ComputeLoop block.
4899 | uint32_t InitalValue = |
4900 | (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0; |
4901 | auto TmpSReg = |
4902 | BuildMI(BB, I, MIMD: DL, MCID: TII->get(Opcode: MovOpc), DestReg: LoopIterator).addReg(RegNo: ExecReg); |
4903 | BuildMI(BB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: InitalValReg) |
4904 | .addImm(Val: InitalValue); |
4905 | BuildMI(BB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_BRANCH)).addMBB(MBB: ComputeLoop); |
4906 | |
4907 | // Start constructing ComputeLoop |
4908 | I = ComputeLoop->end(); |
4909 | auto Accumulator = |
4910 | BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::PHI), DestReg: AccumulatorReg) |
4911 | .addReg(RegNo: InitalValReg) |
4912 | .addMBB(MBB: &BB); |
4913 | auto ActiveBits = |
4914 | BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::PHI), DestReg: ActiveBitsReg) |
4915 | .addReg(RegNo: TmpSReg->getOperand(i: 0).getReg()) |
4916 | .addMBB(MBB: &BB); |
4917 | |
4918 | // Perform the computations |
4919 | unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64; |
4920 | auto FF1 = BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: SFFOpc), DestReg: FF1Reg) |
4921 | .addReg(RegNo: ActiveBits->getOperand(i: 0).getReg()); |
4922 | auto LaneValue = BuildMI(BB&: *ComputeLoop, I, MIMD: DL, |
4923 | MCID: TII->get(Opcode: AMDGPU::V_READLANE_B32), DestReg: LaneValueReg) |
4924 | .addReg(RegNo: SrcReg) |
4925 | .addReg(RegNo: FF1->getOperand(i: 0).getReg()); |
4926 | auto NewAccumulator = BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: DstReg) |
4927 | .addReg(RegNo: Accumulator->getOperand(i: 0).getReg()) |
4928 | .addReg(RegNo: LaneValue->getOperand(i: 0).getReg()); |
4929 | |
    // Clear the bit of the lane we just processed to get the next active lane.
4931 | unsigned BITSETOpc = |
4932 | IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64; |
4933 | auto NewActiveBits = |
4934 | BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: BITSETOpc), DestReg: NewActiveBitsReg) |
4935 | .addReg(RegNo: FF1->getOperand(i: 0).getReg()) |
4936 | .addReg(RegNo: ActiveBits->getOperand(i: 0).getReg()); |
4937 | |
4938 | // Add phi nodes |
4939 | Accumulator.addReg(RegNo: NewAccumulator->getOperand(i: 0).getReg()) |
4940 | .addMBB(MBB: ComputeLoop); |
4941 | ActiveBits.addReg(RegNo: NewActiveBits->getOperand(i: 0).getReg()) |
4942 | .addMBB(MBB: ComputeLoop); |
4943 | |
    // Create the loop-exit compare and the conditional branch back to
    // ComputeLoop.
4945 | unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64; |
4946 | BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: CMPOpc)) |
4947 | .addReg(RegNo: NewActiveBits->getOperand(i: 0).getReg()) |
4948 | .addImm(Val: 0); |
4949 | BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CBRANCH_SCC1)) |
4950 | .addMBB(MBB: ComputeLoop); |
4951 | |
4952 | RetBB = ComputeEnd; |
4953 | } |
4954 | MI.eraseFromParent(); |
4955 | return RetBB; |
4956 | } |
4957 | |
4958 | MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( |
4959 | MachineInstr &MI, MachineBasicBlock *BB) const { |
4960 | |
4961 | const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); |
4962 | MachineFunction *MF = BB->getParent(); |
4963 | SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); |
4964 | |
4965 | switch (MI.getOpcode()) { |
4966 | case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32: |
4967 | return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_MIN_U32); |
4968 | case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32: |
4969 | return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_MAX_U32); |
4970 | case AMDGPU::S_UADDO_PSEUDO: |
4971 | case AMDGPU::S_USUBO_PSEUDO: { |
4972 | const DebugLoc &DL = MI.getDebugLoc(); |
4973 | MachineOperand &Dest0 = MI.getOperand(i: 0); |
4974 | MachineOperand &Dest1 = MI.getOperand(i: 1); |
4975 | MachineOperand &Src0 = MI.getOperand(i: 2); |
4976 | MachineOperand &Src1 = MI.getOperand(i: 3); |
4977 | |
4978 | unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO) |
4979 | ? AMDGPU::S_ADD_I32 |
4980 | : AMDGPU::S_SUB_I32; |
4981 | BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: Dest0.getReg()).add(MO: Src0).add(MO: Src1); |
4982 | |
4983 | BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CSELECT_B64), DestReg: Dest1.getReg()) |
4984 | .addImm(Val: 1) |
4985 | .addImm(Val: 0); |
4986 | |
4987 | MI.eraseFromParent(); |
4988 | return BB; |
4989 | } |
4990 | case AMDGPU::S_ADD_U64_PSEUDO: |
4991 | case AMDGPU::S_SUB_U64_PSEUDO: { |
4992 | // For targets older than GFX12, we emit a sequence of 32-bit operations. |
4993 | // For GFX12, we emit s_add_u64 and s_sub_u64. |
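    // For instance (a sketch, not verbatim output), the add pseudo on an
    // older target expands to roughly:
    //   s_add_u32  dst.sub0, src0.sub0, src1.sub0
    //   s_addc_u32 dst.sub1, src0.sub1, src1.sub1
    // with the subtract pseudo using s_sub_u32 / s_subb_u32 instead.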
4994 | const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); |
4995 | MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); |
4996 | const DebugLoc &DL = MI.getDebugLoc(); |
4997 | MachineOperand &Dest = MI.getOperand(i: 0); |
4998 | MachineOperand &Src0 = MI.getOperand(i: 1); |
4999 | MachineOperand &Src1 = MI.getOperand(i: 2); |
5000 | bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO); |
5001 | if (Subtarget->hasScalarAddSub64()) { |
5002 | unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64; |
5003 | BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: Dest.getReg()) |
5004 | .add(MO: Src0) |
5005 | .add(MO: Src1); |
5006 | } else { |
5007 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
5008 | const TargetRegisterClass *BoolRC = TRI->getBoolRC(); |
5009 | |
5010 | Register DestSub0 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass); |
5011 | Register DestSub1 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass); |
5012 | |
5013 | MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm( |
5014 | MI, MRI, SuperReg: Src0, SuperRC: BoolRC, SubIdx: AMDGPU::sub0, SubRC: &AMDGPU::SReg_32RegClass); |
5015 | MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm( |
5016 | MI, MRI, SuperReg: Src0, SuperRC: BoolRC, SubIdx: AMDGPU::sub1, SubRC: &AMDGPU::SReg_32RegClass); |
5017 | |
5018 | MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm( |
5019 | MI, MRI, SuperReg: Src1, SuperRC: BoolRC, SubIdx: AMDGPU::sub0, SubRC: &AMDGPU::SReg_32RegClass); |
5020 | MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm( |
5021 | MI, MRI, SuperReg: Src1, SuperRC: BoolRC, SubIdx: AMDGPU::sub1, SubRC: &AMDGPU::SReg_32RegClass); |
5022 | |
5023 | unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; |
5024 | unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; |
5025 | BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: LoOpc), DestReg: DestSub0) |
5026 | .add(MO: Src0Sub0) |
5027 | .add(MO: Src1Sub0); |
5028 | BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: HiOpc), DestReg: DestSub1) |
5029 | .add(MO: Src0Sub1) |
5030 | .add(MO: Src1Sub1); |
5031 | BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: Dest.getReg()) |
5032 | .addReg(RegNo: DestSub0) |
5033 | .addImm(Val: AMDGPU::sub0) |
5034 | .addReg(RegNo: DestSub1) |
5035 | .addImm(Val: AMDGPU::sub1); |
5036 | } |
5037 | MI.eraseFromParent(); |
5038 | return BB; |
5039 | } |
5040 | case AMDGPU::V_ADD_U64_PSEUDO: |
5041 | case AMDGPU::V_SUB_U64_PSEUDO: { |
5042 | MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); |
5043 | const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); |
5044 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
5045 | const DebugLoc &DL = MI.getDebugLoc(); |
5046 | |
5047 | bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO); |
5048 | |
5049 | MachineOperand &Dest = MI.getOperand(i: 0); |
5050 | MachineOperand &Src0 = MI.getOperand(i: 1); |
5051 | MachineOperand &Src1 = MI.getOperand(i: 2); |
5052 | |
5053 | if (IsAdd && ST.hasLshlAddB64()) { |
5054 | auto Add = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_LSHL_ADD_U64_e64), |
5055 | DestReg: Dest.getReg()) |
5056 | .add(MO: Src0) |
5057 | .addImm(Val: 0) |
5058 | .add(MO: Src1); |
5059 | TII->legalizeOperands(MI&: *Add); |
5060 | MI.eraseFromParent(); |
5061 | return BB; |
5062 | } |
5063 | |
5064 | const auto *CarryRC = TRI->getRegClass(RCID: AMDGPU::SReg_1_XEXECRegClassID); |
5065 | |
5066 | Register DestSub0 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass); |
5067 | Register DestSub1 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass); |
5068 | |
5069 | Register CarryReg = MRI.createVirtualRegister(RegClass: CarryRC); |
5070 | Register DeadCarryReg = MRI.createVirtualRegister(RegClass: CarryRC); |
5071 | |
5072 | const TargetRegisterClass *Src0RC = Src0.isReg() |
5073 | ? MRI.getRegClass(Reg: Src0.getReg()) |
5074 | : &AMDGPU::VReg_64RegClass; |
5075 | const TargetRegisterClass *Src1RC = Src1.isReg() |
5076 | ? MRI.getRegClass(Reg: Src1.getReg()) |
5077 | : &AMDGPU::VReg_64RegClass; |
5078 | |
5079 | const TargetRegisterClass *Src0SubRC = |
5080 | TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0); |
5081 | const TargetRegisterClass *Src1SubRC = |
5082 | TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1); |
5083 | |
5084 | MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm( |
5085 | MI, MRI, SuperReg: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub0, SubRC: Src0SubRC); |
5086 | MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm( |
5087 | MI, MRI, SuperReg: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub0, SubRC: Src1SubRC); |
5088 | |
5089 | MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm( |
5090 | MI, MRI, SuperReg: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub1, SubRC: Src0SubRC); |
5091 | MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm( |
5092 | MI, MRI, SuperReg: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub1, SubRC: Src1SubRC); |
5093 | |
5094 | unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64; |
5095 | MachineInstr *LoHalf = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: LoOpc), DestReg: DestSub0) |
5096 | .addReg(RegNo: CarryReg, flags: RegState::Define) |
5097 | .add(MO: SrcReg0Sub0) |
5098 | .add(MO: SrcReg1Sub0) |
5099 | .addImm(Val: 0); // clamp bit |
5100 | |
5101 | unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64; |
5102 | MachineInstr *HiHalf = |
5103 | BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: HiOpc), DestReg: DestSub1) |
5104 | .addReg(RegNo: DeadCarryReg, flags: RegState::Define | RegState::Dead) |
5105 | .add(MO: SrcReg0Sub1) |
5106 | .add(MO: SrcReg1Sub1) |
5107 | .addReg(RegNo: CarryReg, flags: RegState::Kill) |
5108 | .addImm(Val: 0); // clamp bit |
5109 | |
5110 | BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: Dest.getReg()) |
5111 | .addReg(RegNo: DestSub0) |
5112 | .addImm(Val: AMDGPU::sub0) |
5113 | .addReg(RegNo: DestSub1) |
5114 | .addImm(Val: AMDGPU::sub1); |
5115 | TII->legalizeOperands(MI&: *LoHalf); |
5116 | TII->legalizeOperands(MI&: *HiHalf); |
5117 | MI.eraseFromParent(); |
5118 | return BB; |
5119 | } |
5120 | case AMDGPU::S_ADD_CO_PSEUDO: |
5121 | case AMDGPU::S_SUB_CO_PSEUDO: { |
    // This pseudo can only be selected from a uniform add/sub carry node, so
    // all of its VGPR operands are assumed to be splat vectors.
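    // Expansion sketch (illustrative only): any VGPR source is first made
    // scalar with v_readfirstlane_b32, the carry-in is compared against zero
    // to set SCC, and then s_addc_u32 / s_subb_u32 plus an s_cselect produce
    // the result and the carry-out.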
5125 | MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); |
5126 | const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); |
5127 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
5128 | MachineBasicBlock::iterator MII = MI; |
5129 | const DebugLoc &DL = MI.getDebugLoc(); |
5130 | MachineOperand &Dest = MI.getOperand(i: 0); |
5131 | MachineOperand &CarryDest = MI.getOperand(i: 1); |
5132 | MachineOperand &Src0 = MI.getOperand(i: 2); |
5133 | MachineOperand &Src1 = MI.getOperand(i: 3); |
5134 | MachineOperand &Src2 = MI.getOperand(i: 4); |
5135 | unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) |
5136 | ? AMDGPU::S_ADDC_U32 |
5137 | : AMDGPU::S_SUBB_U32; |
5138 | if (Src0.isReg() && TRI->isVectorRegister(MRI, Reg: Src0.getReg())) { |
5139 | Register RegOp0 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass); |
5140 | BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: RegOp0) |
5141 | .addReg(RegNo: Src0.getReg()); |
5142 | Src0.setReg(RegOp0); |
5143 | } |
5144 | if (Src1.isReg() && TRI->isVectorRegister(MRI, Reg: Src1.getReg())) { |
5145 | Register RegOp1 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass); |
5146 | BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: RegOp1) |
5147 | .addReg(RegNo: Src1.getReg()); |
5148 | Src1.setReg(RegOp1); |
5149 | } |
5150 | Register RegOp2 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass); |
5151 | if (TRI->isVectorRegister(MRI, Reg: Src2.getReg())) { |
5152 | BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: RegOp2) |
5153 | .addReg(RegNo: Src2.getReg()); |
5154 | Src2.setReg(RegOp2); |
5155 | } |
5156 | |
5157 | const TargetRegisterClass *Src2RC = MRI.getRegClass(Reg: Src2.getReg()); |
5158 | unsigned WaveSize = TRI->getRegSizeInBits(RC: *Src2RC); |
5159 | assert(WaveSize == 64 || WaveSize == 32); |
5160 | |
5161 | if (WaveSize == 64) { |
5162 | if (ST.hasScalarCompareEq64()) { |
5163 | BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_LG_U64)) |
5164 | .addReg(RegNo: Src2.getReg()) |
5165 | .addImm(Val: 0); |
5166 | } else { |
5167 | const TargetRegisterClass *SubRC = |
5168 | TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0); |
5169 | MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm( |
5170 | MI: MII, MRI, SuperReg: Src2, SuperRC: Src2RC, SubIdx: AMDGPU::sub0, SubRC); |
5171 | MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm( |
5172 | MI: MII, MRI, SuperReg: Src2, SuperRC: Src2RC, SubIdx: AMDGPU::sub1, SubRC); |
5173 | Register Src2_32 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass); |
5174 | |
5175 | BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_OR_B32), DestReg: Src2_32) |
5176 | .add(MO: Src2Sub0) |
5177 | .add(MO: Src2Sub1); |
5178 | |
5179 | BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_LG_U32)) |
5180 | .addReg(RegNo: Src2_32, flags: RegState::Kill) |
5181 | .addImm(Val: 0); |
5182 | } |
5183 | } else { |
5184 | BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_LG_U32)) |
5185 | .addReg(RegNo: Src2.getReg()) |
5186 | .addImm(Val: 0); |
5187 | } |
5188 | |
5189 | BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: Dest.getReg()).add(MO: Src0).add(MO: Src1); |
5190 | |
5191 | unsigned SelOpc = |
5192 | (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32; |
5193 | |
5194 | BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: SelOpc), DestReg: CarryDest.getReg()) |
5195 | .addImm(Val: -1) |
5196 | .addImm(Val: 0); |
5197 | |
5198 | MI.eraseFromParent(); |
5199 | return BB; |
5200 | } |
5201 | case AMDGPU::SI_INIT_M0: { |
5202 | BuildMI(BB&: *BB, I: MI.getIterator(), MIMD: MI.getDebugLoc(), |
5203 | MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: AMDGPU::M0) |
5204 | .add(MO: MI.getOperand(i: 0)); |
5205 | MI.eraseFromParent(); |
5206 | return BB; |
5207 | } |
5208 | case AMDGPU::GET_GROUPSTATICSIZE: { |
5209 | assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA || |
5210 | getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL); |
5211 | DebugLoc DL = MI.getDebugLoc(); |
5212 | BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32)) |
5213 | .add(MO: MI.getOperand(i: 0)) |
5214 | .addImm(Val: MFI->getLDSSize()); |
5215 | MI.eraseFromParent(); |
5216 | return BB; |
5217 | } |
5218 | case AMDGPU::GET_SHADERCYCLESHILO: { |
5219 | assert(MF->getSubtarget<GCNSubtarget>().hasShaderCyclesHiLoRegisters()); |
5220 | MachineRegisterInfo &MRI = MF->getRegInfo(); |
5221 | const DebugLoc &DL = MI.getDebugLoc(); |
5222 | // The algorithm is: |
5223 | // |
5224 | // hi1 = getreg(SHADER_CYCLES_HI) |
5225 | // lo1 = getreg(SHADER_CYCLES_LO) |
5226 | // hi2 = getreg(SHADER_CYCLES_HI) |
5227 | // |
5228 | // If hi1 == hi2 then there was no overflow and the result is hi2:lo1. |
5229 | // Otherwise there was overflow and the result is hi2:0. In both cases the |
5230 | // result should represent the actual time at some point during the sequence |
5231 | // of three getregs. |
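    // Equivalently, as a sketch in C-like pseudocode:
    //   lo = (hi1 == hi2) ? lo1 : 0;
    //   result = ((uint64_t)hi2 << 32) | lo;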
5232 | using namespace AMDGPU::Hwreg; |
5233 | Register RegHi1 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass); |
5234 | BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_GETREG_B32), DestReg: RegHi1) |
5235 | .addImm(Val: HwregEncoding::encode(Values: ID_SHADER_CYCLES_HI, Values: 0, Values: 32)); |
5236 | Register RegLo1 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass); |
5237 | BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_GETREG_B32), DestReg: RegLo1) |
5238 | .addImm(Val: HwregEncoding::encode(Values: ID_SHADER_CYCLES, Values: 0, Values: 32)); |
5239 | Register RegHi2 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass); |
5240 | BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_GETREG_B32), DestReg: RegHi2) |
5241 | .addImm(Val: HwregEncoding::encode(Values: ID_SHADER_CYCLES_HI, Values: 0, Values: 32)); |
5242 | BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_EQ_U32)) |
5243 | .addReg(RegNo: RegHi1) |
5244 | .addReg(RegNo: RegHi2); |
5245 | Register RegLo = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass); |
5246 | BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CSELECT_B32), DestReg: RegLo) |
5247 | .addReg(RegNo: RegLo1) |
5248 | .addImm(Val: 0); |
5249 | BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::REG_SEQUENCE)) |
5250 | .add(MO: MI.getOperand(i: 0)) |
5251 | .addReg(RegNo: RegLo) |
5252 | .addImm(Val: AMDGPU::sub0) |
5253 | .addReg(RegNo: RegHi2) |
5254 | .addImm(Val: AMDGPU::sub1); |
5255 | MI.eraseFromParent(); |
5256 | return BB; |
5257 | } |
5258 | case AMDGPU::SI_INDIRECT_SRC_V1: |
5259 | case AMDGPU::SI_INDIRECT_SRC_V2: |
5260 | case AMDGPU::SI_INDIRECT_SRC_V4: |
5261 | case AMDGPU::SI_INDIRECT_SRC_V8: |
5262 | case AMDGPU::SI_INDIRECT_SRC_V9: |
5263 | case AMDGPU::SI_INDIRECT_SRC_V10: |
5264 | case AMDGPU::SI_INDIRECT_SRC_V11: |
5265 | case AMDGPU::SI_INDIRECT_SRC_V12: |
5266 | case AMDGPU::SI_INDIRECT_SRC_V16: |
5267 | case AMDGPU::SI_INDIRECT_SRC_V32: |
5268 | return emitIndirectSrc(MI, MBB&: *BB, ST: *getSubtarget()); |
5269 | case AMDGPU::SI_INDIRECT_DST_V1: |
5270 | case AMDGPU::SI_INDIRECT_DST_V2: |
5271 | case AMDGPU::SI_INDIRECT_DST_V4: |
5272 | case AMDGPU::SI_INDIRECT_DST_V8: |
5273 | case AMDGPU::SI_INDIRECT_DST_V9: |
5274 | case AMDGPU::SI_INDIRECT_DST_V10: |
5275 | case AMDGPU::SI_INDIRECT_DST_V11: |
5276 | case AMDGPU::SI_INDIRECT_DST_V12: |
5277 | case AMDGPU::SI_INDIRECT_DST_V16: |
5278 | case AMDGPU::SI_INDIRECT_DST_V32: |
5279 | return emitIndirectDst(MI, MBB&: *BB, ST: *getSubtarget()); |
5280 | case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO: |
5281 | case AMDGPU::SI_KILL_I1_PSEUDO: |
5282 | return splitKillBlock(MI, BB); |
5283 | case AMDGPU::V_CNDMASK_B64_PSEUDO: { |
5284 | MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); |
5285 | const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); |
5286 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
5287 | |
5288 | Register Dst = MI.getOperand(i: 0).getReg(); |
5289 | const MachineOperand &Src0 = MI.getOperand(i: 1); |
5290 | const MachineOperand &Src1 = MI.getOperand(i: 2); |
5291 | const DebugLoc &DL = MI.getDebugLoc(); |
5292 | Register SrcCond = MI.getOperand(i: 3).getReg(); |
5293 | |
5294 | Register DstLo = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass); |
5295 | Register DstHi = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass); |
5296 | const auto *CondRC = TRI->getRegClass(RCID: AMDGPU::SReg_1_XEXECRegClassID); |
5297 | Register SrcCondCopy = MRI.createVirtualRegister(RegClass: CondRC); |
5298 | |
5299 | const TargetRegisterClass *Src0RC = Src0.isReg() |
5300 | ? MRI.getRegClass(Reg: Src0.getReg()) |
5301 | : &AMDGPU::VReg_64RegClass; |
5302 | const TargetRegisterClass *Src1RC = Src1.isReg() |
5303 | ? MRI.getRegClass(Reg: Src1.getReg()) |
5304 | : &AMDGPU::VReg_64RegClass; |
5305 | |
5306 | const TargetRegisterClass *Src0SubRC = |
5307 | TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0); |
5308 | const TargetRegisterClass *Src1SubRC = |
5309 | TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1); |
5310 | |
5311 | MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm( |
5312 | MI, MRI, SuperReg: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub0, SubRC: Src0SubRC); |
5313 | MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm( |
5314 | MI, MRI, SuperReg: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub0, SubRC: Src1SubRC); |
5315 | |
5316 | MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm( |
5317 | MI, MRI, SuperReg: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub1, SubRC: Src0SubRC); |
5318 | MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm( |
5319 | MI, MRI, SuperReg: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub1, SubRC: Src1SubRC); |
5320 | |
5321 | BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: SrcCondCopy) |
5322 | .addReg(RegNo: SrcCond); |
5323 | BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstLo) |
5324 | .addImm(Val: 0) |
5325 | .add(MO: Src0Sub0) |
5326 | .addImm(Val: 0) |
5327 | .add(MO: Src1Sub0) |
5328 | .addReg(RegNo: SrcCondCopy); |
5329 | BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstHi) |
5330 | .addImm(Val: 0) |
5331 | .add(MO: Src0Sub1) |
5332 | .addImm(Val: 0) |
5333 | .add(MO: Src1Sub1) |
5334 | .addReg(RegNo: SrcCondCopy); |
5335 | |
5336 | BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: Dst) |
5337 | .addReg(RegNo: DstLo) |
5338 | .addImm(Val: AMDGPU::sub0) |
5339 | .addReg(RegNo: DstHi) |
5340 | .addImm(Val: AMDGPU::sub1); |
5341 | MI.eraseFromParent(); |
5342 | return BB; |
5343 | } |
5344 | case AMDGPU::SI_BR_UNDEF: { |
5345 | const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); |
5346 | const DebugLoc &DL = MI.getDebugLoc(); |
5347 | MachineInstr *Br = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CBRANCH_SCC1)) |
5348 | .add(MO: MI.getOperand(i: 0)); |
5349 | Br->getOperand(i: 1).setIsUndef(); // read undef SCC |
5350 | MI.eraseFromParent(); |
5351 | return BB; |
5352 | } |
5353 | case AMDGPU::ADJCALLSTACKUP: |
5354 | case AMDGPU::ADJCALLSTACKDOWN: { |
5355 | const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); |
5356 | MachineInstrBuilder MIB(*MF, &MI); |
5357 | MIB.addReg(RegNo: Info->getStackPtrOffsetReg(), flags: RegState::ImplicitDefine) |
5358 | .addReg(RegNo: Info->getStackPtrOffsetReg(), flags: RegState::Implicit); |
5359 | return BB; |
5360 | } |
5361 | case AMDGPU::SI_CALL_ISEL: { |
5362 | const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); |
5363 | const DebugLoc &DL = MI.getDebugLoc(); |
5364 | |
5365 | unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(MF: *MF); |
5366 | |
5367 | MachineInstrBuilder MIB; |
5368 | MIB = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::SI_CALL), DestReg: ReturnAddrReg); |
5369 | |
5370 | for (const MachineOperand &MO : MI.operands()) |
5371 | MIB.add(MO); |
5372 | |
5373 | MIB.cloneMemRefs(OtherMI: MI); |
5374 | MI.eraseFromParent(); |
5375 | return BB; |
5376 | } |
5377 | case AMDGPU::V_ADD_CO_U32_e32: |
5378 | case AMDGPU::V_SUB_CO_U32_e32: |
5379 | case AMDGPU::V_SUBREV_CO_U32_e32: { |
5380 | // TODO: Define distinct V_*_I32_Pseudo instructions instead. |
5381 | const DebugLoc &DL = MI.getDebugLoc(); |
5382 | unsigned Opc = MI.getOpcode(); |
5383 | |
5384 | bool NeedClampOperand = false; |
5385 | if (TII->pseudoToMCOpcode(Opcode: Opc) == -1) { |
5386 | Opc = AMDGPU::getVOPe64(Opcode: Opc); |
5387 | NeedClampOperand = true; |
5388 | } |
5389 | |
5390 | auto I = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: MI.getOperand(i: 0).getReg()); |
5391 | if (TII->isVOP3(MI: *I)) { |
5392 | const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); |
5393 | const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
5394 | I.addReg(RegNo: TRI->getVCC(), flags: RegState::Define); |
5395 | } |
5396 | I.add(MO: MI.getOperand(i: 1)) |
5397 | .add(MO: MI.getOperand(i: 2)); |
5398 | if (NeedClampOperand) |
5399 | I.addImm(Val: 0); // clamp bit for e64 encoding |
5400 | |
5401 | TII->legalizeOperands(MI&: *I); |
5402 | |
5403 | MI.eraseFromParent(); |
5404 | return BB; |
5405 | } |
5406 | case AMDGPU::V_ADDC_U32_e32: |
5407 | case AMDGPU::V_SUBB_U32_e32: |
5408 | case AMDGPU::V_SUBBREV_U32_e32: |
5409 | // These instructions have an implicit use of vcc which counts towards the |
5410 | // constant bus limit. |
5411 | TII->legalizeOperands(MI); |
5412 | return BB; |
5413 | case AMDGPU::DS_GWS_INIT: |
5414 | case AMDGPU::DS_GWS_SEMA_BR: |
5415 | case AMDGPU::DS_GWS_BARRIER: |
5416 | TII->enforceOperandRCAlignment(MI, OpName: AMDGPU::OpName::data0); |
5417 | [[fallthrough]]; |
5418 | case AMDGPU::DS_GWS_SEMA_V: |
5419 | case AMDGPU::DS_GWS_SEMA_P: |
5420 | case AMDGPU::DS_GWS_SEMA_RELEASE_ALL: |
    // An s_waitcnt 0 is required to be the instruction immediately following.
5422 | if (getSubtarget()->hasGWSAutoReplay()) { |
5423 | bundleInstWithWaitcnt(MI); |
5424 | return BB; |
5425 | } |
5426 | |
5427 | return emitGWSMemViolTestLoop(MI, BB); |
5428 | case AMDGPU::S_SETREG_B32: { |
5429 | // Try to optimize cases that only set the denormal mode or rounding mode. |
5430 | // |
5431 | // If the s_setreg_b32 fully sets all of the bits in the rounding mode or |
5432 | // denormal mode to a constant, we can use s_round_mode or s_denorm_mode |
5433 | // instead. |
5434 | // |
    // FIXME: This could be handled with predicates on the immediate, but
    // tablegen doesn't allow a no-side-effect instruction in the output of a
    // side-effecting pattern.
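    // For example (sketch): an s_setreg_b32 that writes a known constant to
    // exactly the rounding-mode bits of the MODE register can become a single
    // s_round_mode, and one that writes exactly the denormal-mode bits can
    // become an s_denorm_mode.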
5438 | auto [ID, Offset, Width] = |
5439 | AMDGPU::Hwreg::HwregEncoding::decode(Encoded: MI.getOperand(i: 1).getImm()); |
5440 | if (ID != AMDGPU::Hwreg::ID_MODE) |
5441 | return BB; |
5442 | |
5443 | const unsigned WidthMask = maskTrailingOnes<unsigned>(N: Width); |
5444 | const unsigned SetMask = WidthMask << Offset; |
5445 | |
5446 | if (getSubtarget()->hasDenormModeInst()) { |
5447 | unsigned SetDenormOp = 0; |
5448 | unsigned SetRoundOp = 0; |
5449 | |
5450 | // The dedicated instructions can only set the whole denorm or round mode |
5451 | // at once, not a subset of bits in either. |
5452 | if (SetMask == |
5453 | (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) { |
5454 | // If this fully sets both the round and denorm mode, emit the two |
5455 | // dedicated instructions for these. |
5456 | SetRoundOp = AMDGPU::S_ROUND_MODE; |
5457 | SetDenormOp = AMDGPU::S_DENORM_MODE; |
5458 | } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) { |
5459 | SetRoundOp = AMDGPU::S_ROUND_MODE; |
5460 | } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) { |
5461 | SetDenormOp = AMDGPU::S_DENORM_MODE; |
5462 | } |
5463 | |
5464 | if (SetRoundOp || SetDenormOp) { |
5465 | MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); |
5466 | MachineInstr *Def = MRI.getVRegDef(Reg: MI.getOperand(i: 0).getReg()); |
5467 | if (Def && Def->isMoveImmediate() && Def->getOperand(i: 1).isImm()) { |
5468 | unsigned ImmVal = Def->getOperand(i: 1).getImm(); |
5469 | if (SetRoundOp) { |
5470 | BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: SetRoundOp)) |
5471 | .addImm(Val: ImmVal & 0xf); |
5472 | |
5473 | // If we also have the denorm mode, get just the denorm mode bits. |
5474 | ImmVal >>= 4; |
5475 | } |
5476 | |
5477 | if (SetDenormOp) { |
5478 | BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: SetDenormOp)) |
5479 | .addImm(Val: ImmVal & 0xf); |
5480 | } |
5481 | |
5482 | MI.eraseFromParent(); |
5483 | return BB; |
5484 | } |
5485 | } |
5486 | } |
5487 | |
    // If only FP bits are touched, use the no-side-effects pseudo.
5489 | if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK | |
5490 | AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask) |
5491 | MI.setDesc(TII->get(Opcode: AMDGPU::S_SETREG_B32_mode)); |
5492 | |
5493 | return BB; |
5494 | } |
5495 | case AMDGPU::S_INVERSE_BALLOT_U32: |
5496 | case AMDGPU::S_INVERSE_BALLOT_U64: |
5497 | // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if |
5498 | // necessary. After that they are equivalent to a COPY. |
5499 | MI.setDesc(TII->get(Opcode: AMDGPU::COPY)); |
5500 | return BB; |
5501 | case AMDGPU::ENDPGM_TRAP: { |
5502 | const DebugLoc &DL = MI.getDebugLoc(); |
5503 | if (BB->succ_empty() && std::next(x: MI.getIterator()) == BB->end()) { |
5504 | MI.setDesc(TII->get(Opcode: AMDGPU::S_ENDPGM)); |
5505 | MI.addOperand(Op: MachineOperand::CreateImm(Val: 0)); |
5506 | return BB; |
5507 | } |
5508 | |
5509 | // We need a block split to make the real endpgm a terminator. We also don't |
5510 | // want to break phis in successor blocks, so we can't just delete to the |
5511 | // end of the block. |
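    // Resulting control flow, as a sketch:
    //   BB:      ... s_cbranch_execnz TrapBB  (falls through to SplitBB)
    //   SplitBB: remainder of the original block
    //   TrapBB:  s_endpgm 0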
5512 | |
5513 | MachineBasicBlock *SplitBB = BB->splitAt(SplitInst&: MI, UpdateLiveIns: false /*UpdateLiveIns*/); |
5514 | MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); |
5515 | MF->push_back(MBB: TrapBB); |
5516 | BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ENDPGM)) |
5517 | .addImm(Val: 0); |
5518 | BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CBRANCH_EXECNZ)) |
5519 | .addMBB(MBB: TrapBB); |
5520 | |
5521 | BB->addSuccessor(Succ: TrapBB); |
5522 | MI.eraseFromParent(); |
5523 | return SplitBB; |
5524 | } |
5525 | case AMDGPU::SIMULATED_TRAP: { |
5526 | assert(Subtarget->hasPrivEnabledTrap2NopBug()); |
5527 | MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); |
5528 | MachineBasicBlock *SplitBB = |
5529 | TII->insertSimulatedTrap(MRI, MBB&: *BB, MI, DL: MI.getDebugLoc()); |
5530 | MI.eraseFromParent(); |
5531 | return SplitBB; |
5532 | } |
5533 | default: |
5534 | if (TII->isImage(MI) || TII->isMUBUF(MI)) { |
5535 | if (!MI.mayStore()) |
5536 | AddMemOpInit(MI); |
5537 | return BB; |
5538 | } |
5539 | return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, MBB: BB); |
5540 | } |
5541 | } |
5542 | |
5543 | bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const { |
5544 | // This currently forces unfolding various combinations of fsub into fma with |
5545 | // free fneg'd operands. As long as we have fast FMA (controlled by |
5546 | // isFMAFasterThanFMulAndFAdd), we should perform these. |
5547 | |
  // When fma is quarter rate, as for f64 where add / sub are at best half
  // rate, most of these combines appear to be cycle neutral but save on
  // instruction count / code size.
5551 | return true; |
5552 | } |
5553 | |
5554 | bool SITargetLowering::enableAggressiveFMAFusion(LLT Ty) const { return true; } |
5555 | |
5556 | EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx, |
5557 | EVT VT) const { |
5558 | if (!VT.isVector()) { |
5559 | return MVT::i1; |
5560 | } |
5561 | return EVT::getVectorVT(Context&: Ctx, VT: MVT::i1, NumElements: VT.getVectorNumElements()); |
5562 | } |
5563 | |
5564 | MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const { |
5565 | // TODO: Should i16 be used always if legal? For now it would force VALU |
5566 | // shifts. |
5567 | return (VT == MVT::i16) ? MVT::i16 : MVT::i32; |
5568 | } |
5569 | |
5570 | LLT SITargetLowering::getPreferredShiftAmountTy(LLT Ty) const { |
5571 | return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts()) |
5572 | ? Ty.changeElementSize(NewEltSize: 16) |
5573 | : Ty.changeElementSize(NewEltSize: 32); |
5574 | } |
5575 | |
// Answering this is somewhat tricky and depends on the specific device, since
// different devices have different rates for fma and for f64 operations in
// general.
5578 | // |
5579 | // v_fma_f64 and v_mul_f64 always take the same number of cycles as each other |
5580 | // regardless of which device (although the number of cycles differs between |
5581 | // devices), so it is always profitable for f64. |
5582 | // |
5583 | // v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable |
5584 | // only on full rate devices. Normally, we should prefer selecting v_mad_f32 |
5585 | // which we can always do even without fused FP ops since it returns the same |
5586 | // result as the separate operations and since it is always full |
5587 | // rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32 |
5588 | // however does not support denormals, so we do report fma as faster if we have |
5589 | // a fast fma device and require denormals. |
5590 | // |
5591 | bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, |
5592 | EVT VT) const { |
5593 | VT = VT.getScalarType(); |
5594 | |
5595 | switch (VT.getSimpleVT().SimpleTy) { |
5596 | case MVT::f32: { |
5597 | // If mad is not available this depends only on if f32 fma is full rate. |
5598 | if (!Subtarget->hasMadMacF32Insts()) |
5599 | return Subtarget->hasFastFMAF32(); |
5600 | |
    // Otherwise f32 mad is always full rate and returns the same result as
    // the separate operations, so it should be preferred over fma.
    // However, mad does not support denormals.
5604 | if (!denormalModeIsFlushAllF32(MF)) |
5605 | return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts(); |
5606 | |
5607 | // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32. |
5608 | return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts(); |
5609 | } |
5610 | case MVT::f64: |
5611 | return true; |
5612 | case MVT::f16: |
5613 | return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF); |
5614 | default: |
5615 | break; |
5616 | } |
5617 | |
5618 | return false; |
5619 | } |
5620 | |
5621 | bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, |
5622 | LLT Ty) const { |
5623 | switch (Ty.getScalarSizeInBits()) { |
5624 | case 16: |
5625 | return isFMAFasterThanFMulAndFAdd(MF, VT: MVT::f16); |
5626 | case 32: |
5627 | return isFMAFasterThanFMulAndFAdd(MF, VT: MVT::f32); |
5628 | case 64: |
5629 | return isFMAFasterThanFMulAndFAdd(MF, VT: MVT::f64); |
5630 | default: |
5631 | break; |
5632 | } |
5633 | |
5634 | return false; |
5635 | } |
5636 | |
5637 | bool SITargetLowering::isFMADLegal(const MachineInstr &MI, LLT Ty) const { |
5638 | if (!Ty.isScalar()) |
5639 | return false; |
5640 | |
5641 | if (Ty.getScalarSizeInBits() == 16) |
5642 | return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(MF: *MI.getMF()); |
5643 | if (Ty.getScalarSizeInBits() == 32) |
5644 | return Subtarget->hasMadMacF32Insts() && |
5645 | denormalModeIsFlushAllF32(MF: *MI.getMF()); |
5646 | |
5647 | return false; |
5648 | } |
5649 | |
5650 | bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG, |
5651 | const SDNode *N) const { |
5652 | // TODO: Check future ftz flag |
5653 | // v_mad_f32/v_mac_f32 do not support denormals. |
5654 | EVT VT = N->getValueType(ResNo: 0); |
5655 | if (VT == MVT::f32) |
5656 | return Subtarget->hasMadMacF32Insts() && |
5657 | denormalModeIsFlushAllF32(MF: DAG.getMachineFunction()); |
5658 | if (VT == MVT::f16) { |
5659 | return Subtarget->hasMadF16() && |
5660 | denormalModeIsFlushAllF64F16(MF: DAG.getMachineFunction()); |
5661 | } |
5662 | |
5663 | return false; |
5664 | } |
5665 | |
5666 | //===----------------------------------------------------------------------===// |
5667 | // Custom DAG Lowering Operations |
5668 | //===----------------------------------------------------------------------===// |
5669 | |
5670 | // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the |
5671 | // wider vector type is legal. |
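// For example (sketch): an fabs on v4f16 is split into two fabs operations on
// the v2f16 halves, and the results are rejoined with a CONCAT_VECTORS.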
5672 | SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op, |
5673 | SelectionDAG &DAG) const { |
5674 | unsigned Opc = Op.getOpcode(); |
5675 | EVT VT = Op.getValueType(); |
5676 | assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 || |
5677 | VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 || |
5678 | VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 || |
5679 | VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16); |
5680 | |
5681 | SDValue Lo, Hi; |
5682 | std::tie(args&: Lo, args&: Hi) = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 0); |
5683 | |
5684 | SDLoc SL(Op); |
5685 | SDValue OpLo = DAG.getNode(Opcode: Opc, DL: SL, VT: Lo.getValueType(), Operand: Lo, |
5686 | Flags: Op->getFlags()); |
5687 | SDValue OpHi = DAG.getNode(Opcode: Opc, DL: SL, VT: Hi.getValueType(), Operand: Hi, |
5688 | Flags: Op->getFlags()); |
5689 | |
5690 | return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(Op), VT, N1: OpLo, N2: OpHi); |
5691 | } |
5692 | |
5693 | // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the |
5694 | // wider vector type is legal. |
5695 | SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op, |
5696 | SelectionDAG &DAG) const { |
5697 | unsigned Opc = Op.getOpcode(); |
5698 | EVT VT = Op.getValueType(); |
5699 | assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 || |
5700 | VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 || |
5701 | VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 || |
5702 | VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16); |
5703 | |
5704 | SDValue Lo0, Hi0; |
5705 | std::tie(args&: Lo0, args&: Hi0) = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 0); |
5706 | SDValue Lo1, Hi1; |
5707 | std::tie(args&: Lo1, args&: Hi1) = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 1); |
5708 | |
5709 | SDLoc SL(Op); |
5710 | |
5711 | SDValue OpLo = DAG.getNode(Opcode: Opc, DL: SL, VT: Lo0.getValueType(), N1: Lo0, N2: Lo1, |
5712 | Flags: Op->getFlags()); |
5713 | SDValue OpHi = DAG.getNode(Opcode: Opc, DL: SL, VT: Hi0.getValueType(), N1: Hi0, N2: Hi1, |
5714 | Flags: Op->getFlags()); |
5715 | |
5716 | return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(Op), VT, N1: OpLo, N2: OpHi); |
5717 | } |
5718 | |
5719 | SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op, |
5720 | SelectionDAG &DAG) const { |
5721 | unsigned Opc = Op.getOpcode(); |
5722 | EVT VT = Op.getValueType(); |
5723 | assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 || |
5724 | VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 || |
5725 | VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 || |
5726 | VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 || |
5727 | VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 || |
5728 | VT == MVT::v32bf16); |
5729 | |
5730 | SDValue Lo0, Hi0; |
5731 | SDValue Op0 = Op.getOperand(i: 0); |
5732 | std::tie(args&: Lo0, args&: Hi0) = Op0.getValueType().isVector() |
5733 | ? DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 0) |
5734 | : std::pair(Op0, Op0); |
5735 | SDValue Lo1, Hi1; |
5736 | std::tie(args&: Lo1, args&: Hi1) = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 1); |
5737 | SDValue Lo2, Hi2; |
5738 | std::tie(args&: Lo2, args&: Hi2) = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 2); |
5739 | |
5740 | SDLoc SL(Op); |
5741 | auto ResVT = DAG.GetSplitDestVTs(VT); |
5742 | |
5743 | SDValue OpLo = DAG.getNode(Opcode: Opc, DL: SL, VT: ResVT.first, N1: Lo0, N2: Lo1, N3: Lo2, |
5744 | Flags: Op->getFlags()); |
5745 | SDValue OpHi = DAG.getNode(Opcode: Opc, DL: SL, VT: ResVT.second, N1: Hi0, N2: Hi1, N3: Hi2, |
5746 | Flags: Op->getFlags()); |
5747 | |
5748 | return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(Op), VT, N1: OpLo, N2: OpHi); |
5749 | } |
5750 | |
5752 | SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { |
5753 | switch (Op.getOpcode()) { |
5754 | default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); |
5755 | case ISD::BRCOND: return LowerBRCOND(Op, DAG); |
5756 | case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); |
5757 | case ISD::LOAD: { |
5758 | SDValue Result = LowerLOAD(Op, DAG); |
5759 | assert((!Result.getNode() || |
5760 | Result.getNode()->getNumValues() == 2) && |
5761 | "Load should return a value and a chain" ); |
5762 | return Result; |
5763 | } |
5764 | case ISD::FSQRT: { |
5765 | EVT VT = Op.getValueType(); |
5766 | if (VT == MVT::f32) |
5767 | return lowerFSQRTF32(Op, DAG); |
5768 | if (VT == MVT::f64) |
5769 | return lowerFSQRTF64(Op, DAG); |
5770 | return SDValue(); |
5771 | } |
5772 | case ISD::FSIN: |
5773 | case ISD::FCOS: |
5774 | return LowerTrig(Op, DAG); |
5775 | case ISD::SELECT: return LowerSELECT(Op, DAG); |
5776 | case ISD::FDIV: return LowerFDIV(Op, DAG); |
5777 | case ISD::FFREXP: return LowerFFREXP(Op, DAG); |
5778 | case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG); |
5779 | case ISD::STORE: return LowerSTORE(Op, DAG); |
5780 | case ISD::GlobalAddress: { |
5781 | MachineFunction &MF = DAG.getMachineFunction(); |
5782 | SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
5783 | return LowerGlobalAddress(MFI, Op, DAG); |
5784 | } |
5785 | case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); |
5786 | case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG); |
5787 | case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG); |
5788 | case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG); |
5789 | case ISD::INSERT_SUBVECTOR: |
5790 | return lowerINSERT_SUBVECTOR(Op, DAG); |
5791 | case ISD::INSERT_VECTOR_ELT: |
5792 | return lowerINSERT_VECTOR_ELT(Op, DAG); |
5793 | case ISD::EXTRACT_VECTOR_ELT: |
5794 | return lowerEXTRACT_VECTOR_ELT(Op, DAG); |
5795 | case ISD::VECTOR_SHUFFLE: |
5796 | return lowerVECTOR_SHUFFLE(Op, DAG); |
5797 | case ISD::SCALAR_TO_VECTOR: |
5798 | return lowerSCALAR_TO_VECTOR(Op, DAG); |
5799 | case ISD::BUILD_VECTOR: |
5800 | return lowerBUILD_VECTOR(Op, DAG); |
5801 | case ISD::FP_ROUND: |
5802 | case ISD::STRICT_FP_ROUND: |
5803 | return lowerFP_ROUND(Op, DAG); |
5804 | case ISD::FPTRUNC_ROUND: { |
5805 | unsigned Opc; |
5806 | SDLoc DL(Op); |
5807 | |
5808 | if (Op.getOperand(i: 0)->getValueType(ResNo: 0) != MVT::f32) |
5809 | return SDValue(); |
5810 | |
5811 | // Get the rounding mode from the last operand |
5812 | int RoundMode = Op.getConstantOperandVal(i: 1); |
5813 | if (RoundMode == (int)RoundingMode::TowardPositive) |
5814 | Opc = AMDGPUISD::FPTRUNC_ROUND_UPWARD; |
5815 | else if (RoundMode == (int)RoundingMode::TowardNegative) |
5816 | Opc = AMDGPUISD::FPTRUNC_ROUND_DOWNWARD; |
5817 | else |
5818 | return SDValue(); |
5819 | |
5820 | return DAG.getNode(Opcode: Opc, DL, VTList: Op.getNode()->getVTList(), N: Op->getOperand(Num: 0)); |
5821 | } |
5822 | case ISD::TRAP: |
5823 | return lowerTRAP(Op, DAG); |
5824 | case ISD::DEBUGTRAP: |
5825 | return lowerDEBUGTRAP(Op, DAG); |
5826 | case ISD::ABS: |
5827 | case ISD::FABS: |
5828 | case ISD::FNEG: |
5829 | case ISD::FCANONICALIZE: |
5830 | case ISD::BSWAP: |
5831 | return splitUnaryVectorOp(Op, DAG); |
5832 | case ISD::FMINNUM: |
5833 | case ISD::FMAXNUM: |
5834 | return lowerFMINNUM_FMAXNUM(Op, DAG); |
5835 | case ISD::FLDEXP: |
5836 | case ISD::STRICT_FLDEXP: |
5837 | return lowerFLDEXP(Op, DAG); |
5838 | case ISD::FMA: |
5839 | return splitTernaryVectorOp(Op, DAG); |
5840 | case ISD::FP_TO_SINT: |
5841 | case ISD::FP_TO_UINT: |
5842 | return LowerFP_TO_INT(Op, DAG); |
5843 | case ISD::SHL: |
5844 | case ISD::SRA: |
5845 | case ISD::SRL: |
5846 | case ISD::ADD: |
5847 | case ISD::SUB: |
5848 | case ISD::SMIN: |
5849 | case ISD::SMAX: |
5850 | case ISD::UMIN: |
5851 | case ISD::UMAX: |
5852 | case ISD::FADD: |
5853 | case ISD::FMUL: |
5854 | case ISD::FMINNUM_IEEE: |
5855 | case ISD::FMAXNUM_IEEE: |
5856 | case ISD::FMINIMUM: |
5857 | case ISD::FMAXIMUM: |
5858 | case ISD::UADDSAT: |
5859 | case ISD::USUBSAT: |
5860 | case ISD::SADDSAT: |
5861 | case ISD::SSUBSAT: |
5862 | return splitBinaryVectorOp(Op, DAG); |
5863 | case ISD::MUL: |
5864 | return lowerMUL(Op, DAG); |
5865 | case ISD::SMULO: |
5866 | case ISD::UMULO: |
5867 | return lowerXMULO(Op, DAG); |
5868 | case ISD::SMUL_LOHI: |
5869 | case ISD::UMUL_LOHI: |
5870 | return lowerXMUL_LOHI(Op, DAG); |
5871 | case ISD::DYNAMIC_STACKALLOC: |
5872 | return LowerDYNAMIC_STACKALLOC(Op, DAG); |
5873 | case ISD::STACKSAVE: |
5874 | return LowerSTACKSAVE(Op, DAG); |
5875 | case ISD::GET_ROUNDING: |
5876 | return lowerGET_ROUNDING(Op, DAG); |
5877 | case ISD::SET_ROUNDING: |
5878 | return lowerSET_ROUNDING(Op, DAG); |
5879 | case ISD::PREFETCH: |
5880 | return lowerPREFETCH(Op, DAG); |
5881 | case ISD::FP_EXTEND: |
5882 | case ISD::STRICT_FP_EXTEND: |
5883 | return lowerFP_EXTEND(Op, DAG); |
5884 | case ISD::GET_FPENV: |
5885 | return lowerGET_FPENV(Op, DAG); |
5886 | case ISD::SET_FPENV: |
5887 | return lowerSET_FPENV(Op, DAG); |
5888 | } |
5889 | return SDValue(); |
5890 | } |
5891 | |
5892 | // Used for D16: Casts the result of an instruction into the right vector, |
5893 | // packs values if loads return unpacked values. |
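// For example (sketch): with unpacked D16 memory operations, a v2f16 load
// comes back as v2i32; each element is truncated to i16, rebuilt into v2i16,
// and bitcast back to v2f16.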
5894 | static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, |
5895 | const SDLoc &DL, |
5896 | SelectionDAG &DAG, bool Unpacked) { |
5897 | if (!LoadVT.isVector()) |
5898 | return Result; |
5899 | |
5900 | // Cast back to the original packed type or to a larger type that is a |
  // multiple of 32 bits for D16. Widening the return type is required for
  // legalization.
5903 | EVT FittingLoadVT = LoadVT; |
5904 | if ((LoadVT.getVectorNumElements() % 2) == 1) { |
5905 | FittingLoadVT = |
5906 | EVT::getVectorVT(Context&: *DAG.getContext(), VT: LoadVT.getVectorElementType(), |
5907 | NumElements: LoadVT.getVectorNumElements() + 1); |
5908 | } |
5909 | |
5910 | if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16. |
5911 | // Truncate to v2i16/v4i16. |
5912 | EVT IntLoadVT = FittingLoadVT.changeTypeToInteger(); |
5913 | |
    // Work around the legalizer not scalarizing the truncate after vector op
    // legalization and not creating an intermediate vector trunc.
5916 | SmallVector<SDValue, 4> Elts; |
5917 | DAG.ExtractVectorElements(Op: Result, Args&: Elts); |
5918 | for (SDValue &Elt : Elts) |
5919 | Elt = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: Elt); |
5920 | |
    // Pad illegal v1i16/v3f16 to v4i16.
5922 | if ((LoadVT.getVectorNumElements() % 2) == 1) |
5923 | Elts.push_back(Elt: DAG.getUNDEF(VT: MVT::i16)); |
5924 | |
5925 | Result = DAG.getBuildVector(VT: IntLoadVT, DL, Ops: Elts); |
5926 | |
5927 | // Bitcast to original type (v2f16/v4f16). |
5928 | return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: FittingLoadVT, Operand: Result); |
5929 | } |
5930 | |
5931 | // Cast back to the original packed type. |
5932 | return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: FittingLoadVT, Operand: Result); |
5933 | } |
5934 | |
5935 | SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, |
5936 | MemSDNode *M, |
5937 | SelectionDAG &DAG, |
5938 | ArrayRef<SDValue> Ops, |
5939 | bool IsIntrinsic) const { |
5940 | SDLoc DL(M); |
5941 | |
5942 | bool Unpacked = Subtarget->hasUnpackedD16VMem(); |
5943 | EVT LoadVT = M->getValueType(ResNo: 0); |
5944 | |
5945 | EVT EquivLoadVT = LoadVT; |
5946 | if (LoadVT.isVector()) { |
5947 | if (Unpacked) { |
5948 | EquivLoadVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, |
5949 | NumElements: LoadVT.getVectorNumElements()); |
5950 | } else if ((LoadVT.getVectorNumElements() % 2) == 1) { |
5951 | // Widen v3f16 to legal type |
5952 | EquivLoadVT = |
5953 | EVT::getVectorVT(Context&: *DAG.getContext(), VT: LoadVT.getVectorElementType(), |
5954 | NumElements: LoadVT.getVectorNumElements() + 1); |
5955 | } |
5956 | } |
5957 | |
5958 | // Change from v4f16/v2f16 to EquivLoadVT. |
5959 | SDVTList VTList = DAG.getVTList(VT1: EquivLoadVT, VT2: MVT::Other); |
5960 | |
5961 | SDValue Load |
5962 | = DAG.getMemIntrinsicNode( |
5963 | Opcode: IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, dl: DL, |
5964 | VTList, Ops, MemVT: M->getMemoryVT(), |
5965 | MMO: M->getMemOperand()); |
5966 | |
5967 | SDValue Adjusted = adjustLoadValueTypeImpl(Result: Load, LoadVT, DL, DAG, Unpacked); |
5968 | |
5969 | return DAG.getMergeValues(Ops: { Adjusted, Load.getValue(R: 1) }, dl: DL); |
5970 | } |
5971 | |
5972 | SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat, |
5973 | SelectionDAG &DAG, |
5974 | ArrayRef<SDValue> Ops) const { |
5975 | SDLoc DL(M); |
5976 | EVT LoadVT = M->getValueType(ResNo: 0); |
5977 | EVT EltType = LoadVT.getScalarType(); |
5978 | EVT IntVT = LoadVT.changeTypeToInteger(); |
5979 | |
5980 | bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16); |
5981 | |
5982 | assert(M->getNumValues() == 2 || M->getNumValues() == 3); |
5983 | bool IsTFE = M->getNumValues() == 3; |
5984 | |
5985 | unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE |
5986 | : AMDGPUISD::BUFFER_LOAD_FORMAT) |
5987 | : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE |
5988 | : AMDGPUISD::BUFFER_LOAD; |
5989 | |
5990 | if (IsD16) { |
5991 | return adjustLoadValueType(Opcode: AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops); |
5992 | } |
5993 | |
5994 | // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics |
5995 | if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32) |
5996 | return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, MMO: M->getMemOperand(), |
5997 | IsTFE); |
5998 | |
5999 | if (isTypeLegal(VT: LoadVT)) { |
6000 | return getMemIntrinsicNode(Opcode: Opc, DL, VTList: M->getVTList(), Ops, MemVT: IntVT, |
6001 | MMO: M->getMemOperand(), DAG); |
6002 | } |
6003 | |
6004 | EVT CastVT = getEquivalentMemType(Context&: *DAG.getContext(), VT: LoadVT); |
6005 | SDVTList VTList = DAG.getVTList(VT1: CastVT, VT2: MVT::Other); |
6006 | SDValue MemNode = getMemIntrinsicNode(Opcode: Opc, DL, VTList, Ops, MemVT: CastVT, |
6007 | MMO: M->getMemOperand(), DAG); |
6008 | return DAG.getMergeValues( |
6009 | Ops: {DAG.getNode(Opcode: ISD::BITCAST, DL, VT: LoadVT, Operand: MemNode), MemNode.getValue(R: 1)}, |
6010 | dl: DL); |
6011 | } |
6012 | |
6013 | static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, |
6014 | SDNode *N, SelectionDAG &DAG) { |
6015 | EVT VT = N->getValueType(ResNo: 0); |
6016 | unsigned CondCode = N->getConstantOperandVal(Num: 3); |
6017 | if (!ICmpInst::isIntPredicate(P: static_cast<ICmpInst::Predicate>(CondCode))) |
6018 | return DAG.getUNDEF(VT); |
6019 | |
6020 | ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode); |
6021 | |
6022 | SDValue LHS = N->getOperand(Num: 1); |
6023 | SDValue RHS = N->getOperand(Num: 2); |
6024 | |
6025 | SDLoc DL(N); |
6026 | |
6027 | EVT CmpVT = LHS.getValueType(); |
6028 | if (CmpVT == MVT::i16 && !TLI.isTypeLegal(VT: MVT::i16)) { |
6029 | unsigned PromoteOp = ICmpInst::isSigned(predicate: IcInput) ? |
6030 | ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; |
6031 | LHS = DAG.getNode(Opcode: PromoteOp, DL, VT: MVT::i32, Operand: LHS); |
6032 | RHS = DAG.getNode(Opcode: PromoteOp, DL, VT: MVT::i32, Operand: RHS); |
6033 | } |
6034 | |
6035 | ISD::CondCode CCOpcode = getICmpCondCode(Pred: IcInput); |
6036 | |
6037 | unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize(); |
6038 | EVT CCVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: WavefrontSize); |
6039 | |
6040 | SDValue SetCC = DAG.getNode(Opcode: AMDGPUISD::SETCC, DL, VT: CCVT, N1: LHS, N2: RHS, |
6041 | N3: DAG.getCondCode(Cond: CCOpcode)); |
6042 | if (VT.bitsEq(VT: CCVT)) |
6043 | return SetCC; |
6044 | return DAG.getZExtOrTrunc(Op: SetCC, DL, VT); |
6045 | } |
6046 | |
6047 | static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, |
6048 | SDNode *N, SelectionDAG &DAG) { |
6049 | EVT VT = N->getValueType(ResNo: 0); |
6050 | |
6051 | unsigned CondCode = N->getConstantOperandVal(Num: 3); |
6052 | if (!FCmpInst::isFPPredicate(P: static_cast<FCmpInst::Predicate>(CondCode))) |
6053 | return DAG.getUNDEF(VT); |
6054 | |
6055 | SDValue Src0 = N->getOperand(Num: 1); |
6056 | SDValue Src1 = N->getOperand(Num: 2); |
6057 | EVT CmpVT = Src0.getValueType(); |
6058 | SDLoc SL(N); |
6059 | |
6060 | if (CmpVT == MVT::f16 && !TLI.isTypeLegal(VT: CmpVT)) { |
6061 | Src0 = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Src0); |
6062 | Src1 = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Src1); |
6063 | } |
6064 | |
6065 | FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode); |
6066 | ISD::CondCode CCOpcode = getFCmpCondCode(Pred: IcInput); |
6067 | unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize(); |
6068 | EVT CCVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: WavefrontSize); |
6069 | SDValue SetCC = DAG.getNode(Opcode: AMDGPUISD::SETCC, DL: SL, VT: CCVT, N1: Src0, |
6070 | N2: Src1, N3: DAG.getCondCode(Cond: CCOpcode)); |
6071 | if (VT.bitsEq(VT: CCVT)) |
6072 | return SetCC; |
6073 | return DAG.getZExtOrTrunc(Op: SetCC, DL: SL, VT); |
6074 | } |
6075 | |
6076 | static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, |
6077 | SelectionDAG &DAG) { |
6078 | EVT VT = N->getValueType(ResNo: 0); |
6079 | SDValue Src = N->getOperand(Num: 1); |
6080 | SDLoc SL(N); |
6081 | |
6082 | if (Src.getOpcode() == ISD::SETCC) { |
6083 | // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...) |
6084 | return DAG.getNode(Opcode: AMDGPUISD::SETCC, DL: SL, VT, N1: Src.getOperand(i: 0), |
6085 | N2: Src.getOperand(i: 1), N3: Src.getOperand(i: 2)); |
6086 | } |
6087 | if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Val&: Src)) { |
6088 | // (ballot 0) -> 0 |
6089 | if (Arg->isZero()) |
6090 | return DAG.getConstant(Val: 0, DL: SL, VT); |
6091 | |
6092 | // (ballot 1) -> EXEC/EXEC_LO |
6093 | if (Arg->isOne()) { |
6094 | Register Exec; |
6095 | if (VT.getScalarSizeInBits() == 32) |
6096 | Exec = AMDGPU::EXEC_LO; |
6097 | else if (VT.getScalarSizeInBits() == 64) |
6098 | Exec = AMDGPU::EXEC; |
6099 | else |
6100 | return SDValue(); |
6101 | |
6102 | return DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: SL, Reg: Exec, VT); |
6103 | } |
6104 | } |
6105 | |
6106 | // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0) |
6107 | // ISD::SETNE) |
6108 | return DAG.getNode( |
6109 | Opcode: AMDGPUISD::SETCC, DL: SL, VT, N1: DAG.getZExtOrTrunc(Op: Src, DL: SL, VT: MVT::i32), |
6110 | N2: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32), N3: DAG.getCondCode(Cond: ISD::SETNE)); |
6111 | } |
6112 | |
6113 | static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, |
6114 | SelectionDAG &DAG) { |
6115 | EVT VT = N->getValueType(ResNo: 0); |
6116 | unsigned ValSize = VT.getSizeInBits(); |
6117 | unsigned IID = N->getConstantOperandVal(Num: 0); |
6118 | bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 || |
6119 | IID == Intrinsic::amdgcn_permlanex16; |
6120 | SDLoc SL(N); |
6121 | MVT IntVT = MVT::getIntegerVT(BitWidth: ValSize); |
6122 | |
6123 | auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1, |
6124 | SDValue Src2, MVT ValT) -> SDValue { |
6125 | SmallVector<SDValue, 8> Operands; |
6126 | switch (IID) { |
6127 | case Intrinsic::amdgcn_permlane16: |
6128 | case Intrinsic::amdgcn_permlanex16: |
6129 | Operands.push_back(Elt: N->getOperand(Num: 6)); |
6130 | Operands.push_back(Elt: N->getOperand(Num: 5)); |
6131 | Operands.push_back(Elt: N->getOperand(Num: 4)); |
6132 | [[fallthrough]]; |
6133 | case Intrinsic::amdgcn_writelane: |
6134 | Operands.push_back(Elt: Src2); |
6135 | [[fallthrough]]; |
6136 | case Intrinsic::amdgcn_readlane: |
6137 | Operands.push_back(Elt: Src1); |
6138 | [[fallthrough]]; |
6139 | case Intrinsic::amdgcn_readfirstlane: |
6140 | case Intrinsic::amdgcn_permlane64: |
6141 | Operands.push_back(Elt: Src0); |
6142 | break; |
6143 | default: |
6144 | llvm_unreachable("unhandled lane op" ); |
6145 | } |
6146 | |
6147 | Operands.push_back(Elt: DAG.getTargetConstant(Val: IID, DL: SL, VT: MVT::i32)); |
6148 | std::reverse(first: Operands.begin(), last: Operands.end()); |
6149 | |
6150 | if (SDNode *GL = N->getGluedNode()) { |
6151 | assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE); |
6152 | GL = GL->getOperand(Num: 0).getNode(); |
6153 | Operands.push_back(Elt: DAG.getNode(Opcode: ISD::CONVERGENCECTRL_GLUE, DL: SL, VT: MVT::Glue, |
6154 | Operand: SDValue(GL, 0))); |
6155 | } |
6156 | |
6157 | return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: ValT, Ops: Operands); |
6158 | }; |
6159 | |
6160 | SDValue Src0 = N->getOperand(Num: 1); |
6161 | SDValue Src1, Src2; |
6162 | if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane || |
6163 | IsPermLane16) { |
6164 | Src1 = N->getOperand(Num: 2); |
6165 | if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) |
6166 | Src2 = N->getOperand(Num: 3); |
6167 | } |
6168 | |
6169 | if (ValSize == 32) { |
6170 | // Already legal |
6171 | return SDValue(); |
6172 | } |
6173 | |
6174 | if (ValSize < 32) { |
6175 | bool IsFloat = VT.isFloatingPoint(); |
6176 | Src0 = DAG.getAnyExtOrTrunc(Op: IsFloat ? DAG.getBitcast(VT: IntVT, V: Src0) : Src0, |
6177 | DL: SL, VT: MVT::i32); |
6178 | |
6179 | if (IsPermLane16) { |
6180 | Src1 = DAG.getAnyExtOrTrunc(Op: IsFloat ? DAG.getBitcast(VT: IntVT, V: Src1) : Src1, |
6181 | DL: SL, VT: MVT::i32); |
6182 | } |
6183 | |
6184 | if (IID == Intrinsic::amdgcn_writelane) { |
6185 | Src2 = DAG.getAnyExtOrTrunc(Op: IsFloat ? DAG.getBitcast(VT: IntVT, V: Src2) : Src2, |
6186 | DL: SL, VT: MVT::i32); |
6187 | } |
6188 | |
6189 | SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32); |
6190 | SDValue Trunc = DAG.getAnyExtOrTrunc(Op: LaneOp, DL: SL, VT: IntVT); |
6191 | return IsFloat ? DAG.getBitcast(VT, V: Trunc) : Trunc; |
6192 | } |
6193 | |
6194 | if (ValSize % 32 != 0) |
6195 | return SDValue(); |
6196 | |
6197 | auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue { |
6198 | EVT VT = N->getValueType(ResNo: 0); |
6199 | unsigned NE = VT.getVectorNumElements(); |
6200 | EVT EltVT = VT.getVectorElementType(); |
6201 | SmallVector<SDValue, 8> Scalars; |
6202 | unsigned NumOperands = N->getNumOperands(); |
6203 | SmallVector<SDValue, 4> Operands(NumOperands); |
6204 | SDNode *GL = N->getGluedNode(); |
6205 | |
6206 | // only handle convergencectrl_glue |
6207 | assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE); |
6208 | |
6209 | for (unsigned i = 0; i != NE; ++i) { |
6210 | for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e; |
6211 | ++j) { |
6212 | SDValue Operand = N->getOperand(Num: j); |
6213 | EVT OperandVT = Operand.getValueType(); |
6214 | if (OperandVT.isVector()) { |
6215 | // A vector operand; extract a single element. |
6216 | EVT OperandEltVT = OperandVT.getVectorElementType(); |
6217 | Operands[j] = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: OperandEltVT, |
6218 | N1: Operand, N2: DAG.getVectorIdxConstant(Val: i, DL: SL)); |
6219 | } else { |
6220 | // A scalar operand; just use it as is. |
6221 | Operands[j] = Operand; |
6222 | } |
6223 | } |
6224 | |
6225 | if (GL) |
6226 | Operands[NumOperands - 1] = |
6227 | DAG.getNode(Opcode: ISD::CONVERGENCECTRL_GLUE, DL: SL, VT: MVT::Glue, |
6228 | Operand: SDValue(GL->getOperand(Num: 0).getNode(), 0)); |
6229 | |
6230 | Scalars.push_back(Elt: DAG.getNode(Opcode: N->getOpcode(), DL: SL, VT: EltVT, Ops: Operands)); |
6231 | } |
6232 | |
6233 | EVT VecVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: EltVT, NumElements: NE); |
6234 | return DAG.getBuildVector(VT: VecVT, DL: SL, Ops: Scalars); |
6235 | }; |
6236 | |
6237 | if (VT.isVector()) { |
6238 | switch (MVT::SimpleValueType EltTy = |
6239 | VT.getVectorElementType().getSimpleVT().SimpleTy) { |
6240 | case MVT::i32: |
6241 | case MVT::f32: { |
6242 | SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT()); |
6243 | return unrollLaneOp(LaneOp.getNode()); |
6244 | } |
6245 | case MVT::i16: |
6246 | case MVT::f16: |
6247 | case MVT::bf16: { |
6248 | MVT SubVecVT = MVT::getVectorVT(VT: EltTy, NumElements: 2); |
6249 | SmallVector<SDValue, 4> Pieces; |
6250 | SDValue Src0SubVec, Src1SubVec, Src2SubVec; |
6251 | for (unsigned i = 0, EltIdx = 0; i < ValSize / 32; i++) { |
6252 | Src0SubVec = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: SubVecVT, N1: Src0, |
6253 | N2: DAG.getConstant(Val: EltIdx, DL: SL, VT: MVT::i32)); |
6254 | |
6255 | if (IsPermLane16) |
6256 | Src1SubVec = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: SubVecVT, N1: Src1, |
6257 | N2: DAG.getConstant(Val: EltIdx, DL: SL, VT: MVT::i32)); |
6258 | |
6259 | if (IID == Intrinsic::amdgcn_writelane) |
6260 | Src2SubVec = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: SubVecVT, N1: Src2, |
6261 | N2: DAG.getConstant(Val: EltIdx, DL: SL, VT: MVT::i32)); |
6262 | |
6263 | Pieces.push_back( |
6264 | Elt: IsPermLane16 |
6265 | ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT) |
6266 | : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT)); |
6267 | EltIdx += 2; |
6268 | } |
6269 | return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SL, VT, Ops: Pieces); |
6270 | } |
6271 | default: |
6272 | // Handle all other cases by bitcasting to i32 vectors |
6273 | break; |
6274 | } |
6275 | } |
6276 | |
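  // Fallback (sketch): bitcast the value to a vector of i32 (e.g. a 64-bit
  // v4f16 becomes v2i32), perform the lane op on each i32 element, and
  // bitcast the result back to the original type.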
6277 | MVT VecVT = MVT::getVectorVT(VT: MVT::i32, NumElements: ValSize / 32); |
6278 | Src0 = DAG.getBitcast(VT: VecVT, V: Src0); |
6279 | |
6280 | if (IsPermLane16) |
6281 | Src1 = DAG.getBitcast(VT: VecVT, V: Src1); |
6282 | |
6283 | if (IID == Intrinsic::amdgcn_writelane) |
6284 | Src2 = DAG.getBitcast(VT: VecVT, V: Src2); |
6285 | |
6286 | SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT); |
6287 | SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode()); |
6288 | return DAG.getBitcast(VT, V: UnrolledLaneOp); |
6289 | } |
6290 | |
6291 | void SITargetLowering::ReplaceNodeResults(SDNode *N, |
6292 | SmallVectorImpl<SDValue> &Results, |
6293 | SelectionDAG &DAG) const { |
6294 | switch (N->getOpcode()) { |
6295 | case ISD::INSERT_VECTOR_ELT: { |
6296 | if (SDValue Res = lowerINSERT_VECTOR_ELT(Op: SDValue(N, 0), DAG)) |
6297 | Results.push_back(Elt: Res); |
6298 | return; |
6299 | } |
6300 | case ISD::EXTRACT_VECTOR_ELT: { |
6301 | if (SDValue Res = lowerEXTRACT_VECTOR_ELT(Op: SDValue(N, 0), DAG)) |
6302 | Results.push_back(Elt: Res); |
6303 | return; |
6304 | } |
6305 | case ISD::INTRINSIC_WO_CHAIN: { |
6306 | unsigned IID = N->getConstantOperandVal(Num: 0); |
6307 | switch (IID) { |
6308 | case Intrinsic::amdgcn_make_buffer_rsrc: |
6309 | Results.push_back(Elt: lowerPointerAsRsrcIntrin(Op: N, DAG)); |
6310 | return; |
6311 | case Intrinsic::amdgcn_cvt_pkrtz: { |
6312 | SDValue Src0 = N->getOperand(Num: 1); |
6313 | SDValue Src1 = N->getOperand(Num: 2); |
6314 | SDLoc SL(N); |
6315 | SDValue Cvt = DAG.getNode(Opcode: AMDGPUISD::CVT_PKRTZ_F16_F32, DL: SL, VT: MVT::i32, |
6316 | N1: Src0, N2: Src1); |
6317 | Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2f16, Operand: Cvt)); |
6318 | return; |
6319 | } |
6320 | case Intrinsic::amdgcn_cvt_pknorm_i16: |
6321 | case Intrinsic::amdgcn_cvt_pknorm_u16: |
6322 | case Intrinsic::amdgcn_cvt_pk_i16: |
6323 | case Intrinsic::amdgcn_cvt_pk_u16: { |
6324 | SDValue Src0 = N->getOperand(Num: 1); |
6325 | SDValue Src1 = N->getOperand(Num: 2); |
6326 | SDLoc SL(N); |
6327 | unsigned Opcode; |
6328 | |
6329 | if (IID == Intrinsic::amdgcn_cvt_pknorm_i16) |
6330 | Opcode = AMDGPUISD::CVT_PKNORM_I16_F32; |
6331 | else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16) |
6332 | Opcode = AMDGPUISD::CVT_PKNORM_U16_F32; |
6333 | else if (IID == Intrinsic::amdgcn_cvt_pk_i16) |
6334 | Opcode = AMDGPUISD::CVT_PK_I16_I32; |
6335 | else |
6336 | Opcode = AMDGPUISD::CVT_PK_U16_U32; |
6337 | |
6338 | EVT VT = N->getValueType(ResNo: 0); |
6339 | if (isTypeLegal(VT)) |
6340 | Results.push_back(Elt: DAG.getNode(Opcode, DL: SL, VT, N1: Src0, N2: Src1)); |
6341 | else { |
6342 | SDValue Cvt = DAG.getNode(Opcode, DL: SL, VT: MVT::i32, N1: Src0, N2: Src1); |
6343 | Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i16, Operand: Cvt)); |
6344 | } |
6345 | return; |
6346 | } |
6347 | case Intrinsic::amdgcn_s_buffer_load: { |
// Lower llvm.amdgcn.s.buffer.load.{i8,u8} intrinsics. We initially generate
// s_buffer_load_u8 for both signed and unsigned loads. The DAG combiner
// (performSignExtendInRegCombine()) then merges the s_buffer_load_u8 with a
// sext instruction and replaces it with s_buffer_load_i8.
6353 | if (!Subtarget->hasScalarSubwordLoads()) |
6354 | return; |
6355 | SDValue Op = SDValue(N, 0); |
6356 | SDValue Rsrc = Op.getOperand(i: 1); |
6357 | SDValue Offset = Op.getOperand(i: 2); |
6358 | SDValue CachePolicy = Op.getOperand(i: 3); |
6359 | EVT VT = Op.getValueType(); |
6360 | assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n" ); |
6361 | SDLoc DL(Op); |
6362 | MachineFunction &MF = DAG.getMachineFunction(); |
6363 | const DataLayout &DataLayout = DAG.getDataLayout(); |
6364 | Align Alignment = |
6365 | DataLayout.getABITypeAlign(Ty: VT.getTypeForEVT(Context&: *DAG.getContext())); |
6366 | MachineMemOperand *MMO = MF.getMachineMemOperand( |
6367 | PtrInfo: MachinePointerInfo(), |
6368 | F: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | |
6369 | MachineMemOperand::MOInvariant, |
6370 | Size: VT.getStoreSize(), BaseAlignment: Alignment); |
6371 | SDValue LoadVal; |
6372 | if (!Offset->isDivergent()) { |
6373 | SDValue Ops[] = {Rsrc, // source register |
6374 | Offset, CachePolicy}; |
6375 | SDValue BufferLoad = |
6376 | DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::SBUFFER_LOAD_UBYTE, dl: DL, |
6377 | VTList: DAG.getVTList(VT: MVT::i32), Ops, MemVT: VT, MMO); |
6378 | LoadVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: BufferLoad); |
6379 | } else { |
6380 | SDValue Ops[] = { |
6381 | DAG.getEntryNode(), // Chain |
6382 | Rsrc, // rsrc |
6383 | DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex |
6384 | {}, // voffset |
6385 | {}, // soffset |
6386 | {}, // offset |
6387 | CachePolicy, // cachepolicy |
6388 | DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen |
6389 | }; |
6390 | setBufferOffsets(CombinedOffset: Offset, DAG, Offsets: &Ops[3], Alignment: Align(4)); |
6391 | LoadVal = handleByteShortBufferLoads(DAG, LoadVT: VT, DL, Ops, MMO); |
6392 | } |
6393 | Results.push_back(Elt: LoadVal); |
6394 | return; |
6395 | } |
6396 | } |
6397 | break; |
6398 | } |
6399 | case ISD::INTRINSIC_W_CHAIN: { |
6400 | if (SDValue Res = LowerINTRINSIC_W_CHAIN(Op: SDValue(N, 0), DAG)) { |
6401 | if (Res.getOpcode() == ISD::MERGE_VALUES) { |
6402 | // FIXME: Hacky |
6403 | for (unsigned I = 0; I < Res.getNumOperands(); I++) { |
6404 | Results.push_back(Elt: Res.getOperand(i: I)); |
6405 | } |
6406 | } else { |
6407 | Results.push_back(Elt: Res); |
6408 | Results.push_back(Elt: Res.getValue(R: 1)); |
6409 | } |
6410 | return; |
6411 | } |
6412 | |
6413 | break; |
6414 | } |
6415 | case ISD::SELECT: { |
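// Select on an illegal type is handled by bitcasting both values to an
// equivalent integer type (any-extended to i32 if it is narrower), selecting
// on the integers, and bitcasting the result back.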
6416 | SDLoc SL(N); |
6417 | EVT VT = N->getValueType(ResNo: 0); |
6418 | EVT NewVT = getEquivalentMemType(Context&: *DAG.getContext(), VT); |
6419 | SDValue LHS = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVT, Operand: N->getOperand(Num: 1)); |
6420 | SDValue RHS = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVT, Operand: N->getOperand(Num: 2)); |
6421 | |
6422 | EVT SelectVT = NewVT; |
6423 | if (NewVT.bitsLT(VT: MVT::i32)) { |
6424 | LHS = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i32, Operand: LHS); |
6425 | RHS = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i32, Operand: RHS); |
6426 | SelectVT = MVT::i32; |
6427 | } |
6428 | |
6429 | SDValue NewSelect = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: SelectVT, |
6430 | N1: N->getOperand(Num: 0), N2: LHS, N3: RHS); |
6431 | |
6432 | if (NewVT != SelectVT) |
6433 | NewSelect = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: NewVT, Operand: NewSelect); |
6434 | Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: NewSelect)); |
6435 | return; |
6436 | } |
6437 | case ISD::FNEG: { |
6438 | if (N->getValueType(ResNo: 0) != MVT::v2f16) |
6439 | break; |
6440 | |
6441 | SDLoc SL(N); |
6442 | SDValue BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: N->getOperand(Num: 0)); |
6443 | |
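// Negate both f16 halves at once by flipping the two sign bits of the i32
// bitcast.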
6444 | SDValue Op = DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i32, |
6445 | N1: BC, |
6446 | N2: DAG.getConstant(Val: 0x80008000, DL: SL, VT: MVT::i32)); |
6447 | Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2f16, Operand: Op)); |
6448 | return; |
6449 | } |
6450 | case ISD::FABS: { |
6451 | if (N->getValueType(ResNo: 0) != MVT::v2f16) |
6452 | break; |
6453 | |
6454 | SDLoc SL(N); |
6455 | SDValue BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: N->getOperand(Num: 0)); |
6456 | |
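// Take the absolute value of both f16 halves at once by clearing the two
// sign bits of the i32 bitcast.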
6457 | SDValue Op = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32, |
6458 | N1: BC, |
6459 | N2: DAG.getConstant(Val: 0x7fff7fff, DL: SL, VT: MVT::i32)); |
6460 | Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2f16, Operand: Op)); |
6461 | return; |
6462 | } |
6463 | case ISD::FSQRT: { |
6464 | if (N->getValueType(ResNo: 0) != MVT::f16) |
6465 | break; |
6466 | Results.push_back(Elt: lowerFSQRTF16(Op: SDValue(N, 0), DAG)); |
6467 | break; |
6468 | } |
6469 | default: |
6470 | AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG); |
6471 | break; |
6472 | } |
6473 | } |
6474 | |
/// Helper function for LowerBRCOND: find a user of \p Value whose opcode is
/// \p Opcode, or return null if there is none.
6476 | static SDNode *findUser(SDValue Value, unsigned Opcode) { |
6477 | |
6478 | SDNode *Parent = Value.getNode(); |
6479 | for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end(); |
6480 | I != E; ++I) { |
6481 | |
6482 | if (I.getUse().get() != Value) |
6483 | continue; |
6484 | |
6485 | if (I->getOpcode() == Opcode) |
6486 | return *I; |
6487 | } |
6488 | return nullptr; |
6489 | } |
6490 | |
6491 | unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const { |
6492 | if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) { |
6493 | switch (Intr->getConstantOperandVal(Num: 1)) { |
6494 | case Intrinsic::amdgcn_if: |
6495 | return AMDGPUISD::IF; |
6496 | case Intrinsic::amdgcn_else: |
6497 | return AMDGPUISD::ELSE; |
6498 | case Intrinsic::amdgcn_loop: |
6499 | return AMDGPUISD::LOOP; |
6500 | case Intrinsic::amdgcn_end_cf: |
6501 | llvm_unreachable("should not occur" ); |
6502 | default: |
6503 | return 0; |
6504 | } |
6505 | } |
6506 | |
6507 | // break, if_break, else_break are all only used as inputs to loop, not |
6508 | // directly as branch conditions. |
6509 | return 0; |
6510 | } |
6511 | |
6512 | bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const { |
6513 | const Triple &TT = getTargetMachine().getTargetTriple(); |
6514 | return (GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || |
6515 | GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) && |
6516 | AMDGPU::shouldEmitConstantsToTextSection(TT); |
6517 | } |
6518 | |
6519 | bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const { |
6520 | if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) |
6521 | return false; |
6522 | |
6523 | // FIXME: Either avoid relying on address space here or change the default |
6524 | // address space for functions to avoid the explicit check. |
6525 | return (GV->getValueType()->isFunctionTy() || |
6526 | !isNonGlobalAddrSpace(AS: GV->getAddressSpace())) && |
6527 | !shouldEmitFixup(GV) && !getTargetMachine().shouldAssumeDSOLocal(GV); |
6528 | } |
6529 | |
6530 | bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const { |
6531 | return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV); |
6532 | } |
6533 | |
6534 | bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const { |
6535 | if (!GV->hasExternalLinkage()) |
6536 | return true; |
6537 | |
6538 | const auto OS = getTargetMachine().getTargetTriple().getOS(); |
6539 | return OS == Triple::AMDHSA || OS == Triple::AMDPAL; |
6540 | } |
6541 | |
/// This transforms the control flow intrinsics to get the branch destination
/// as the last parameter, and also switches the branch target with BR if the
/// need arises.
6544 | SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, |
6545 | SelectionDAG &DAG) const { |
6546 | SDLoc DL(BRCOND); |
6547 | |
6548 | SDNode *Intr = BRCOND.getOperand(i: 1).getNode(); |
6549 | SDValue Target = BRCOND.getOperand(i: 2); |
6550 | SDNode *BR = nullptr; |
6551 | SDNode *SetCC = nullptr; |
6552 | |
6553 | if (Intr->getOpcode() == ISD::SETCC) { |
6554 | // As long as we negate the condition everything is fine |
6555 | SetCC = Intr; |
6556 | Intr = SetCC->getOperand(Num: 0).getNode(); |
6557 | |
6558 | } else { |
6559 | // Get the target from BR if we don't negate the condition |
6560 | BR = findUser(Value: BRCOND, Opcode: ISD::BR); |
6561 | assert(BR && "brcond missing unconditional branch user" ); |
6562 | Target = BR->getOperand(Num: 1); |
6563 | } |
6564 | |
6565 | unsigned CFNode = isCFIntrinsic(Intr); |
6566 | if (CFNode == 0) { |
6567 | // This is a uniform branch so we don't need to legalize. |
6568 | return BRCOND; |
6569 | } |
6570 | |
6571 | bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID || |
6572 | Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN; |
6573 | |
6574 | assert(!SetCC || |
6575 | (SetCC->getConstantOperandVal(1) == 1 && |
6576 | cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() == |
6577 | ISD::SETNE)); |
6578 | |
6579 | // operands of the new intrinsic call |
6580 | SmallVector<SDValue, 4> Ops; |
6581 | if (HaveChain) |
6582 | Ops.push_back(Elt: BRCOND.getOperand(i: 0)); |
6583 | |
6584 | Ops.append(in_start: Intr->op_begin() + (HaveChain ? 2 : 1), in_end: Intr->op_end()); |
6585 | Ops.push_back(Elt: Target); |
6586 | |
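// The new node keeps every result type of the original intrinsic except the
// first one (the boolean condition consumed by the branch).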
6587 | ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end()); |
6588 | |
6589 | // build the new intrinsic call |
6590 | SDNode *Result = DAG.getNode(Opcode: CFNode, DL, VTList: DAG.getVTList(VTs: Res), Ops).getNode(); |
6591 | |
6592 | if (!HaveChain) { |
6593 | SDValue Ops[] = { |
6594 | SDValue(Result, 0), |
6595 | BRCOND.getOperand(i: 0) |
6596 | }; |
6597 | |
6598 | Result = DAG.getMergeValues(Ops, dl: DL).getNode(); |
6599 | } |
6600 | |
6601 | if (BR) { |
6602 | // Give the branch instruction our target |
6603 | SDValue Ops[] = { |
6604 | BR->getOperand(Num: 0), |
6605 | BRCOND.getOperand(i: 2) |
6606 | }; |
6607 | SDValue NewBR = DAG.getNode(Opcode: ISD::BR, DL, VTList: BR->getVTList(), Ops); |
6608 | DAG.ReplaceAllUsesWith(From: BR, To: NewBR.getNode()); |
6609 | } |
6610 | |
6611 | SDValue Chain = SDValue(Result, Result->getNumValues() - 1); |
6612 | |
6613 | // Copy the intrinsic results to registers |
6614 | for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) { |
6615 | SDNode *CopyToReg = findUser(Value: SDValue(Intr, i), Opcode: ISD::CopyToReg); |
6616 | if (!CopyToReg) |
6617 | continue; |
6618 | |
6619 | Chain = DAG.getCopyToReg( |
6620 | Chain, dl: DL, |
6621 | Reg: CopyToReg->getOperand(Num: 1), |
6622 | N: SDValue(Result, i - 1), |
6623 | Glue: SDValue()); |
6624 | |
6625 | DAG.ReplaceAllUsesWith(From: SDValue(CopyToReg, 0), To: CopyToReg->getOperand(Num: 0)); |
6626 | } |
6627 | |
6628 | // Remove the old intrinsic from the chain |
6629 | DAG.ReplaceAllUsesOfValueWith( |
6630 | From: SDValue(Intr, Intr->getNumValues() - 1), |
6631 | To: Intr->getOperand(Num: 0)); |
6632 | |
6633 | return Chain; |
6634 | } |
6635 | |
6636 | SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, |
6637 | SelectionDAG &DAG) const { |
6638 | MVT VT = Op.getSimpleValueType(); |
6639 | SDLoc DL(Op); |
6640 | // Checking the depth |
6641 | if (Op.getConstantOperandVal(i: 0) != 0) |
6642 | return DAG.getConstant(Val: 0, DL, VT); |
6643 | |
6644 | MachineFunction &MF = DAG.getMachineFunction(); |
6645 | const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
6646 | // Check for kernel and shader functions |
6647 | if (Info->isEntryFunction()) |
6648 | return DAG.getConstant(Val: 0, DL, VT); |
6649 | |
6650 | MachineFrameInfo &MFI = MF.getFrameInfo(); |
6651 | // There is a call to @llvm.returnaddress in this function |
6652 | MFI.setReturnAddressIsTaken(true); |
6653 | |
6654 | const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); |
6655 | // Get the return address reg and mark it as an implicit live-in |
Register Reg = MF.addLiveIn(
    PReg: TRI->getReturnAddressReg(MF),
    RC: getRegClassFor(VT, isDivergent: Op.getNode()->isDivergent()));
6657 | |
6658 | return DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: DL, Reg, VT); |
6659 | } |
6660 | |
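// Convert \p Op to \p VT, using FP_EXTEND if \p Op's type is not wider than
// \p VT and FP_ROUND otherwise.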
6661 | SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, |
6662 | SDValue Op, |
6663 | const SDLoc &DL, |
6664 | EVT VT) const { |
6665 | return Op.getValueType().bitsLE(VT) ? |
6666 | DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT, Operand: Op) : |
6667 | DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT, N1: Op, |
6668 | N2: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32)); |
6669 | } |
6670 | |
6671 | SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { |
6672 | assert(Op.getValueType() == MVT::f16 && |
6673 | "Do not know how to custom lower FP_ROUND for non-f16 type" ); |
6674 | |
6675 | SDValue Src = Op.getOperand(i: 0); |
6676 | EVT SrcVT = Src.getValueType(); |
6677 | if (SrcVT != MVT::f64) |
6678 | return Op; |
6679 | |
6680 | // TODO: Handle strictfp |
6681 | if (Op.getOpcode() != ISD::FP_ROUND) |
6682 | return Op; |
6683 | |
6684 | SDLoc DL(Op); |
6685 | |
6686 | SDValue FpToFp16 = DAG.getNode(Opcode: ISD::FP_TO_FP16, DL, VT: MVT::i32, Operand: Src); |
6687 | SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: FpToFp16); |
6688 | return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f16, Operand: Trunc); |
6689 | } |
6690 | |
6691 | SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op, |
6692 | SelectionDAG &DAG) const { |
6693 | EVT VT = Op.getValueType(); |
6694 | const MachineFunction &MF = DAG.getMachineFunction(); |
6695 | const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
6696 | bool IsIEEEMode = Info->getMode().IEEE; |
6697 | |
6698 | // FIXME: Assert during selection that this is only selected for |
6699 | // ieee_mode. Currently a combine can produce the ieee version for non-ieee |
6700 | // mode functions, but this happens to be OK since it's only done in cases |
6701 | // where there is known no sNaN. |
6702 | if (IsIEEEMode) |
6703 | return expandFMINNUM_FMAXNUM(N: Op.getNode(), DAG); |
6704 | |
6705 | if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 || |
6706 | VT == MVT::v16bf16) |
6707 | return splitBinaryVectorOp(Op, DAG); |
6708 | return Op; |
6709 | } |
6710 | |
6711 | SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const { |
6712 | bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP; |
6713 | EVT VT = Op.getValueType(); |
6714 | assert(VT == MVT::f16); |
6715 | |
6716 | SDValue Exp = Op.getOperand(i: IsStrict ? 2 : 1); |
6717 | EVT ExpVT = Exp.getValueType(); |
6718 | if (ExpVT == MVT::i16) |
6719 | return Op; |
6720 | |
6721 | SDLoc DL(Op); |
6722 | |
6723 | // Correct the exponent type for f16 to i16. |
6724 | // Clamp the range of the exponent to the instruction's range. |
6725 | |
6726 | // TODO: This should be a generic narrowing legalization, and can easily be |
6727 | // for GlobalISel. |
6728 | |
6729 | SDValue MinExp = DAG.getConstant(Val: minIntN(N: 16), DL, VT: ExpVT); |
6730 | SDValue ClampMin = DAG.getNode(Opcode: ISD::SMAX, DL, VT: ExpVT, N1: Exp, N2: MinExp); |
6731 | |
6732 | SDValue MaxExp = DAG.getConstant(Val: maxIntN(N: 16), DL, VT: ExpVT); |
6733 | SDValue Clamp = DAG.getNode(Opcode: ISD::SMIN, DL, VT: ExpVT, N1: ClampMin, N2: MaxExp); |
6734 | |
6735 | SDValue TruncExp = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: Clamp); |
6736 | |
6737 | if (IsStrict) { |
6738 | return DAG.getNode(Opcode: ISD::STRICT_FLDEXP, DL, ResultTys: {VT, MVT::Other}, |
6739 | Ops: {Op.getOperand(i: 0), Op.getOperand(i: 1), TruncExp}); |
6740 | } |
6741 | |
6742 | return DAG.getNode(Opcode: ISD::FLDEXP, DL, VT, N1: Op.getOperand(i: 0), N2: TruncExp); |
6743 | } |
6744 | |
6745 | // Custom lowering for vector multiplications and s_mul_u64. |
6746 | SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const { |
6747 | EVT VT = Op.getValueType(); |
6748 | |
6749 | // Split vector operands. |
6750 | if (VT.isVector()) |
6751 | return splitBinaryVectorOp(Op, DAG); |
6752 | |
assert(VT == MVT::i64 && "The following code is specific to s_mul_u64");
6754 | |
// There are four ways to lower s_mul_u64:
//
// 1. If all the operands are uniform, then we lower it as it is.
//
// 2. If the operands are divergent, then we have to split s_mul_u64 into
//    32-bit multiplications because there is no vector equivalent of
//    s_mul_u64.
//
// 3. If the cost model decides that it is more efficient to use vector
//    registers, then we have to split s_mul_u64 into 32-bit multiplications.
//    This happens in splitScalarSMULU64() in SIInstrInfo.cpp.
//
// 4. If the cost model decides to use vector registers and both of the
//    operands are zero-extended/sign-extended from 32 bits, then we split
//    the s_mul_u64 into two 32-bit multiplications. The problem is that it
//    is not possible to check whether the operands are zero-extended or
//    sign-extended in SIInstrInfo.cpp. For this reason, here we replace
//    s_mul_u64 with s_mul_u64_u32_pseudo if both operands are zero-extended,
//    and with s_mul_i64_i32_pseudo if both operands are sign-extended.
//    If the cost model decides that we have to use vector registers, then
//    splitScalarSMulPseudo() (in SIInstrInfo.cpp) splits s_mul_u64_u32/
//    s_mul_i64_i32_pseudo into two vector multiplications. If the cost model
//    decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
//    s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
//    SIInstrInfo.cpp.
6779 | |
6780 | if (Op->isDivergent()) |
6781 | return SDValue(); |
6782 | |
6783 | SDValue Op0 = Op.getOperand(i: 0); |
6784 | SDValue Op1 = Op.getOperand(i: 1); |
// If both operands are zero-extended to 32 bits, then we replace s_mul_u64
// with s_mul_u64_u32_pseudo. If both operands are sign-extended to 32 bits,
// then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
6788 | KnownBits Op0KnownBits = DAG.computeKnownBits(Op: Op0); |
6789 | unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros(); |
6790 | KnownBits Op1KnownBits = DAG.computeKnownBits(Op: Op1); |
6791 | unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros(); |
6792 | SDLoc SL(Op); |
6793 | if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32) |
6794 | return SDValue( |
6795 | DAG.getMachineNode(Opcode: AMDGPU::S_MUL_U64_U32_PSEUDO, dl: SL, VT, Op1: Op0, Op2: Op1), 0); |
6796 | unsigned Op0SignBits = DAG.ComputeNumSignBits(Op: Op0); |
6797 | unsigned Op1SignBits = DAG.ComputeNumSignBits(Op: Op1); |
6798 | if (Op0SignBits >= 33 && Op1SignBits >= 33) |
6799 | return SDValue( |
6800 | DAG.getMachineNode(Opcode: AMDGPU::S_MUL_I64_I32_PSEUDO, dl: SL, VT, Op1: Op0, Op2: Op1), 0); |
6801 | // If all the operands are uniform, then we lower s_mul_u64 as it is. |
6802 | return Op; |
6803 | } |
6804 | |
6805 | SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const { |
6806 | EVT VT = Op.getValueType(); |
6807 | SDLoc SL(Op); |
6808 | SDValue LHS = Op.getOperand(i: 0); |
6809 | SDValue RHS = Op.getOperand(i: 1); |
6810 | bool isSigned = Op.getOpcode() == ISD::SMULO; |
6811 | |
6812 | if (ConstantSDNode *RHSC = isConstOrConstSplat(N: RHS)) { |
6813 | const APInt &C = RHSC->getAPIntValue(); |
6814 | // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X } |
6815 | if (C.isPowerOf2()) { |
6816 | // smulo(x, signed_min) is same as umulo(x, signed_min). |
6817 | bool UseArithShift = isSigned && !C.isMinSignedValue(); |
6818 | SDValue ShiftAmt = DAG.getConstant(Val: C.logBase2(), DL: SL, VT: MVT::i32); |
6819 | SDValue Result = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT, N1: LHS, N2: ShiftAmt); |
6820 | SDValue Overflow = DAG.getSetCC(DL: SL, VT: MVT::i1, |
6821 | LHS: DAG.getNode(Opcode: UseArithShift ? ISD::SRA : ISD::SRL, |
6822 | DL: SL, VT, N1: Result, N2: ShiftAmt), |
6823 | RHS: LHS, Cond: ISD::SETNE); |
6824 | return DAG.getMergeValues(Ops: { Result, Overflow }, dl: SL); |
6825 | } |
6826 | } |
6827 | |
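// General case: compute the full product and flag an overflow if the high
// half differs from the sign extension of the low half (signed) or from
// zero (unsigned).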
6828 | SDValue Result = DAG.getNode(Opcode: ISD::MUL, DL: SL, VT, N1: LHS, N2: RHS); |
6829 | SDValue Top = DAG.getNode(Opcode: isSigned ? ISD::MULHS : ISD::MULHU, |
6830 | DL: SL, VT, N1: LHS, N2: RHS); |
6831 | |
6832 | SDValue Sign = isSigned |
6833 | ? DAG.getNode(Opcode: ISD::SRA, DL: SL, VT, N1: Result, |
6834 | N2: DAG.getConstant(Val: VT.getScalarSizeInBits() - 1, DL: SL, VT: MVT::i32)) |
6835 | : DAG.getConstant(Val: 0, DL: SL, VT); |
6836 | SDValue Overflow = DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: Top, RHS: Sign, Cond: ISD::SETNE); |
6837 | |
6838 | return DAG.getMergeValues(Ops: { Result, Overflow }, dl: SL); |
6839 | } |
6840 | |
6841 | SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const { |
6842 | if (Op->isDivergent()) { |
6843 | // Select to V_MAD_[IU]64_[IU]32. |
6844 | return Op; |
6845 | } |
6846 | if (Subtarget->hasSMulHi()) { |
6847 | // Expand to S_MUL_I32 + S_MUL_HI_[IU]32. |
6848 | return SDValue(); |
6849 | } |
6850 | // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to |
6851 | // calculate the high part, so we might as well do the whole thing with |
6852 | // V_MAD_[IU]64_[IU]32. |
6853 | return Op; |
6854 | } |
6855 | |
6856 | SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const { |
6857 | if (!Subtarget->isTrapHandlerEnabled() || |
6858 | Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) |
6859 | return lowerTrapEndpgm(Op, DAG); |
6860 | |
6861 | return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG) : |
6862 | lowerTrapHsaQueuePtr(Op, DAG); |
6863 | } |
6864 | |
6865 | SDValue SITargetLowering::lowerTrapEndpgm( |
6866 | SDValue Op, SelectionDAG &DAG) const { |
6867 | SDLoc SL(Op); |
6868 | SDValue Chain = Op.getOperand(i: 0); |
6869 | return DAG.getNode(Opcode: AMDGPUISD::ENDPGM_TRAP, DL: SL, VT: MVT::Other, Operand: Chain); |
6870 | } |
6871 | |
6872 | SDValue SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT, |
6873 | const SDLoc &DL, Align Alignment, ImplicitParameter Param) const { |
6874 | MachineFunction &MF = DAG.getMachineFunction(); |
6875 | uint64_t Offset = getImplicitParameterOffset(MF, Param); |
6876 | SDValue Ptr = lowerKernArgParameterPtr(DAG, SL: DL, Chain: DAG.getEntryNode(), Offset); |
6877 | MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); |
6878 | return DAG.getLoad(VT, dl: DL, Chain: DAG.getEntryNode(), Ptr, PtrInfo, Alignment, |
6879 | MMOFlags: MachineMemOperand::MODereferenceable | |
6880 | MachineMemOperand::MOInvariant); |
6881 | } |
6882 | |
6883 | SDValue SITargetLowering::lowerTrapHsaQueuePtr( |
6884 | SDValue Op, SelectionDAG &DAG) const { |
6885 | SDLoc SL(Op); |
6886 | SDValue Chain = Op.getOperand(i: 0); |
6887 | |
6888 | SDValue QueuePtr; |
6889 | // For code object version 5, QueuePtr is passed through implicit kernarg. |
6890 | const Module *M = DAG.getMachineFunction().getFunction().getParent(); |
6891 | if (AMDGPU::getAMDHSACodeObjectVersion(M: *M) >= AMDGPU::AMDHSA_COV5) { |
6892 | QueuePtr = |
6893 | loadImplicitKernelArgument(DAG, VT: MVT::i64, DL: SL, Alignment: Align(8), Param: QUEUE_PTR); |
6894 | } else { |
6895 | MachineFunction &MF = DAG.getMachineFunction(); |
6896 | SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
6897 | Register UserSGPR = Info->getQueuePtrUserSGPR(); |
6898 | |
6899 | if (UserSGPR == AMDGPU::NoRegister) { |
6900 | // We probably are in a function incorrectly marked with |
6901 | // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the |
6902 | // trap, so just use a null pointer. |
6903 | QueuePtr = DAG.getConstant(Val: 0, DL: SL, VT: MVT::i64); |
6904 | } else { |
6905 | QueuePtr = CreateLiveInRegister(DAG, RC: &AMDGPU::SReg_64RegClass, Reg: UserSGPR, |
6906 | VT: MVT::i64); |
6907 | } |
6908 | } |
6909 | |
6910 | SDValue SGPR01 = DAG.getRegister(Reg: AMDGPU::SGPR0_SGPR1, VT: MVT::i64); |
6911 | SDValue ToReg = DAG.getCopyToReg(Chain, dl: SL, Reg: SGPR01, |
6912 | N: QueuePtr, Glue: SDValue()); |
6913 | |
6914 | uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap); |
6915 | SDValue Ops[] = { |
6916 | ToReg, |
6917 | DAG.getTargetConstant(Val: TrapID, DL: SL, VT: MVT::i16), |
6918 | SGPR01, |
6919 | ToReg.getValue(R: 1) |
6920 | }; |
6921 | return DAG.getNode(Opcode: AMDGPUISD::TRAP, DL: SL, VT: MVT::Other, Ops); |
6922 | } |
6923 | |
6924 | SDValue SITargetLowering::lowerTrapHsa( |
6925 | SDValue Op, SelectionDAG &DAG) const { |
6926 | SDLoc SL(Op); |
6927 | SDValue Chain = Op.getOperand(i: 0); |
6928 | |
6929 | // We need to simulate the 's_trap 2' instruction on targets that run in |
6930 | // PRIV=1 (where it is treated as a nop). |
6931 | if (Subtarget->hasPrivEnabledTrap2NopBug()) |
6932 | return DAG.getNode(Opcode: AMDGPUISD::SIMULATED_TRAP, DL: SL, VT: MVT::Other, Operand: Chain); |
6933 | |
6934 | uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap); |
6935 | SDValue Ops[] = { |
6936 | Chain, |
6937 | DAG.getTargetConstant(Val: TrapID, DL: SL, VT: MVT::i16) |
6938 | }; |
6939 | return DAG.getNode(Opcode: AMDGPUISD::TRAP, DL: SL, VT: MVT::Other, Ops); |
6940 | } |
6941 | |
6942 | SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const { |
6943 | SDLoc SL(Op); |
6944 | SDValue Chain = Op.getOperand(i: 0); |
6945 | MachineFunction &MF = DAG.getMachineFunction(); |
6946 | |
6947 | if (!Subtarget->isTrapHandlerEnabled() || |
6948 | Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { |
6949 | DiagnosticInfoUnsupported NoTrap(MF.getFunction(), |
6950 | "debugtrap handler not supported" , |
6951 | Op.getDebugLoc(), |
6952 | DS_Warning); |
6953 | LLVMContext &Ctx = MF.getFunction().getContext(); |
6954 | Ctx.diagnose(DI: NoTrap); |
6955 | return Chain; |
6956 | } |
6957 | |
6958 | uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap); |
6959 | SDValue Ops[] = { |
6960 | Chain, |
6961 | DAG.getTargetConstant(Val: TrapID, DL: SL, VT: MVT::i16) |
6962 | }; |
6963 | return DAG.getNode(Opcode: AMDGPUISD::TRAP, DL: SL, VT: MVT::Other, Ops); |
6964 | } |
6965 | |
6966 | SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL, |
6967 | SelectionDAG &DAG) const { |
6968 | if (Subtarget->hasApertureRegs()) { |
6969 | const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS) |
6970 | ? AMDGPU::SRC_SHARED_BASE |
6971 | : AMDGPU::SRC_PRIVATE_BASE; |
6972 | // Note: this feature (register) is broken. When used as a 32-bit operand, |
6973 | // it returns a wrong value (all zeroes?). The real value is in the upper 32 |
6974 | // bits. |
6975 | // |
6976 | // To work around the issue, directly emit a 64 bit mov from this register |
6977 | // then extract the high bits. Note that this shouldn't even result in a |
6978 | // shift being emitted and simply become a pair of registers (e.g.): |
6979 | // s_mov_b64 s[6:7], src_shared_base |
6980 | // v_mov_b32_e32 v1, s7 |
6981 | // |
6982 | // FIXME: It would be more natural to emit a CopyFromReg here, but then copy |
6983 | // coalescing would kick in and it would think it's okay to use the "HI" |
6984 | // subregister directly (instead of extracting the HI 32 bits) which is an |
6985 | // artificial (unusable) register. |
6986 | // Register TableGen definitions would need an overhaul to get rid of the |
6987 | // artificial "HI" aperture registers and prevent this kind of issue from |
6988 | // happening. |
6989 | SDNode *Mov = DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B64, dl: DL, VT: MVT::i64, |
6990 | Op1: DAG.getRegister(Reg: ApertureRegNo, VT: MVT::i64)); |
6991 | return DAG.getNode( |
6992 | Opcode: ISD::TRUNCATE, DL, VT: MVT::i32, |
6993 | Operand: DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i64, |
6994 | Ops: {SDValue(Mov, 0), DAG.getConstant(Val: 32, DL, VT: MVT::i64)})); |
6995 | } |
6996 | |
6997 | // For code object version 5, private_base and shared_base are passed through |
6998 | // implicit kernargs. |
6999 | const Module *M = DAG.getMachineFunction().getFunction().getParent(); |
7000 | if (AMDGPU::getAMDHSACodeObjectVersion(M: *M) >= AMDGPU::AMDHSA_COV5) { |
7001 | ImplicitParameter Param = |
7002 | (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE; |
7003 | return loadImplicitKernelArgument(DAG, VT: MVT::i32, DL, Alignment: Align(4), Param); |
7004 | } |
7005 | |
7006 | MachineFunction &MF = DAG.getMachineFunction(); |
7007 | SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
7008 | Register UserSGPR = Info->getQueuePtrUserSGPR(); |
7009 | if (UserSGPR == AMDGPU::NoRegister) { |
7010 | // We probably are in a function incorrectly marked with |
7011 | // amdgpu-no-queue-ptr. This is undefined. |
7012 | return DAG.getUNDEF(VT: MVT::i32); |
7013 | } |
7014 | |
7015 | SDValue QueuePtr = CreateLiveInRegister( |
7016 | DAG, RC: &AMDGPU::SReg_64RegClass, Reg: UserSGPR, VT: MVT::i64); |
7017 | |
7018 | // Offset into amd_queue_t for group_segment_aperture_base_hi / |
7019 | // private_segment_aperture_base_hi. |
7020 | uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; |
7021 | |
7022 | SDValue Ptr = |
7023 | DAG.getObjectPtrOffset(SL: DL, Ptr: QueuePtr, Offset: TypeSize::getFixed(ExactSize: StructOffset)); |
7024 | |
7025 | // TODO: Use custom target PseudoSourceValue. |
7026 | // TODO: We should use the value from the IR intrinsic call, but it might not |
7027 | // be available and how do we get it? |
7028 | MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); |
7029 | return DAG.getLoad(VT: MVT::i32, dl: DL, Chain: QueuePtr.getValue(R: 1), Ptr, PtrInfo, |
7030 | Alignment: commonAlignment(A: Align(64), Offset: StructOffset), |
7031 | MMOFlags: MachineMemOperand::MODereferenceable | |
7032 | MachineMemOperand::MOInvariant); |
7033 | } |
7034 | |
7035 | /// Return true if the value is a known valid address, such that a null check is |
7036 | /// not necessary. |
7037 | static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG, |
7038 | const AMDGPUTargetMachine &TM, unsigned AddrSpace) { |
7039 | if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) || |
7040 | isa<BasicBlockSDNode>(Val)) |
7041 | return true; |
7042 | |
7043 | if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val)) |
7044 | return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace); |
7045 | |
7046 | // TODO: Search through arithmetic, handle arguments and loads |
7047 | // marked nonnull. |
7048 | return false; |
7049 | } |
7050 | |
7051 | SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op, |
7052 | SelectionDAG &DAG) const { |
7053 | SDLoc SL(Op); |
7054 | |
7055 | const AMDGPUTargetMachine &TM = |
7056 | static_cast<const AMDGPUTargetMachine &>(getTargetMachine()); |
7057 | |
7058 | unsigned DestAS, SrcAS; |
7059 | SDValue Src; |
7060 | bool IsNonNull = false; |
7061 | if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Val&: Op)) { |
7062 | SrcAS = ASC->getSrcAddressSpace(); |
7063 | Src = ASC->getOperand(Num: 0); |
7064 | DestAS = ASC->getDestAddressSpace(); |
7065 | } else { |
7066 | assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN && |
7067 | Op.getConstantOperandVal(0) == |
7068 | Intrinsic::amdgcn_addrspacecast_nonnull); |
7069 | Src = Op->getOperand(Num: 1); |
7070 | SrcAS = Op->getConstantOperandVal(Num: 2); |
7071 | DestAS = Op->getConstantOperandVal(Num: 3); |
7072 | IsNonNull = true; |
7073 | } |
7074 | |
7075 | SDValue FlatNullPtr = DAG.getConstant(Val: 0, DL: SL, VT: MVT::i64); |
7076 | |
7077 | // flat -> local/private |
7078 | if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { |
7079 | if (DestAS == AMDGPUAS::LOCAL_ADDRESS || |
7080 | DestAS == AMDGPUAS::PRIVATE_ADDRESS) { |
7081 | SDValue Ptr = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: Src); |
7082 | |
7083 | if (IsNonNull || isKnownNonNull(Val: Op, DAG, TM, AddrSpace: SrcAS)) |
7084 | return Ptr; |
7085 | |
7086 | unsigned NullVal = TM.getNullPointerValue(AddrSpace: DestAS); |
7087 | SDValue SegmentNullPtr = DAG.getConstant(Val: NullVal, DL: SL, VT: MVT::i32); |
7088 | SDValue NonNull = DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: Src, RHS: FlatNullPtr, Cond: ISD::SETNE); |
7089 | |
7090 | return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::i32, N1: NonNull, N2: Ptr, |
7091 | N3: SegmentNullPtr); |
7092 | } |
7093 | } |
7094 | |
7095 | // local/private -> flat |
7096 | if (DestAS == AMDGPUAS::FLAT_ADDRESS) { |
7097 | if (SrcAS == AMDGPUAS::LOCAL_ADDRESS || |
7098 | SrcAS == AMDGPUAS::PRIVATE_ADDRESS) { |
7099 | |
7100 | SDValue Aperture = getSegmentAperture(AS: SrcAS, DL: SL, DAG); |
7101 | SDValue CvtPtr = |
7102 | DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32, N1: Src, N2: Aperture); |
7103 | CvtPtr = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: CvtPtr); |
7104 | |
7105 | if (IsNonNull || isKnownNonNull(Val: Op, DAG, TM, AddrSpace: SrcAS)) |
7106 | return CvtPtr; |
7107 | |
7108 | unsigned NullVal = TM.getNullPointerValue(AddrSpace: SrcAS); |
7109 | SDValue SegmentNullPtr = DAG.getConstant(Val: NullVal, DL: SL, VT: MVT::i32); |
7110 | |
7111 | SDValue NonNull |
7112 | = DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: Src, RHS: SegmentNullPtr, Cond: ISD::SETNE); |
7113 | |
7114 | return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::i64, N1: NonNull, N2: CvtPtr, |
7115 | N3: FlatNullPtr); |
7116 | } |
7117 | } |
7118 | |
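// Casting a 32-bit constant-address pointer to a 64-bit address space:
// rebuild the full pointer by pairing the 32-bit value with the function's
// known high address bits.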
7119 | if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && |
7120 | Op.getValueType() == MVT::i64) { |
7121 | const SIMachineFunctionInfo *Info = |
7122 | DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>(); |
7123 | SDValue Hi = DAG.getConstant(Val: Info->get32BitAddressHighBits(), DL: SL, VT: MVT::i32); |
7124 | SDValue Vec = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32, N1: Src, N2: Hi); |
7125 | return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: Vec); |
7126 | } |
7127 | |
7128 | if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && |
7129 | Src.getValueType() == MVT::i64) |
7130 | return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: Src); |
7131 | |
7132 | // global <-> flat are no-ops and never emitted. |
7133 | |
7134 | const MachineFunction &MF = DAG.getMachineFunction(); |
7135 | DiagnosticInfoUnsupported InvalidAddrSpaceCast( |
7136 | MF.getFunction(), "invalid addrspacecast" , SL.getDebugLoc()); |
7137 | DAG.getContext()->diagnose(DI: InvalidAddrSpaceCast); |
7138 | |
7139 | return DAG.getUNDEF(VT: Op->getValueType(ResNo: 0)); |
7140 | } |
7141 | |
7142 | // This lowers an INSERT_SUBVECTOR by extracting the individual elements from |
7143 | // the small vector and inserting them into the big vector. That is better than |
7144 | // the default expansion of doing it via a stack slot. Even though the use of |
7145 | // the stack slot would be optimized away afterwards, the stack slot itself |
7146 | // remains. |
7147 | SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op, |
7148 | SelectionDAG &DAG) const { |
7149 | SDValue Vec = Op.getOperand(i: 0); |
7150 | SDValue Ins = Op.getOperand(i: 1); |
7151 | SDValue Idx = Op.getOperand(i: 2); |
7152 | EVT VecVT = Vec.getValueType(); |
7153 | EVT InsVT = Ins.getValueType(); |
7154 | EVT EltVT = VecVT.getVectorElementType(); |
7155 | unsigned InsNumElts = InsVT.getVectorNumElements(); |
7156 | unsigned IdxVal = Idx->getAsZExtVal(); |
7157 | SDLoc SL(Op); |
7158 | |
7159 | if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) { |
7160 | // Insert 32-bit registers at a time. |
7161 | assert(InsNumElts % 2 == 0 && "expect legal vector types" ); |
7162 | |
7163 | unsigned VecNumElts = VecVT.getVectorNumElements(); |
7164 | EVT NewVecVT = |
7165 | EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements: VecNumElts / 2); |
7166 | EVT NewInsVT = InsNumElts == 2 ? MVT::i32 |
7167 | : EVT::getVectorVT(Context&: *DAG.getContext(), |
7168 | VT: MVT::i32, NumElements: InsNumElts / 2); |
7169 | |
7170 | Vec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVecVT, Operand: Vec); |
7171 | Ins = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewInsVT, Operand: Ins); |
7172 | |
7173 | for (unsigned I = 0; I != InsNumElts / 2; ++I) { |
7174 | SDValue Elt; |
7175 | if (InsNumElts == 2) { |
7176 | Elt = Ins; |
7177 | } else { |
7178 | Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Ins, |
7179 | N2: DAG.getConstant(Val: I, DL: SL, VT: MVT::i32)); |
7180 | } |
7181 | Vec = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL: SL, VT: NewVecVT, N1: Vec, N2: Elt, |
7182 | N3: DAG.getConstant(Val: IdxVal / 2 + I, DL: SL, VT: MVT::i32)); |
7183 | } |
7184 | |
7185 | return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: VecVT, Operand: Vec); |
7186 | } |
7187 | |
7188 | for (unsigned I = 0; I != InsNumElts; ++I) { |
7189 | SDValue Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, N1: Ins, |
7190 | N2: DAG.getConstant(Val: I, DL: SL, VT: MVT::i32)); |
7191 | Vec = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL: SL, VT: VecVT, N1: Vec, N2: Elt, |
7192 | N3: DAG.getConstant(Val: IdxVal + I, DL: SL, VT: MVT::i32)); |
7193 | } |
7194 | return Vec; |
7195 | } |
7196 | |
7197 | SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, |
7198 | SelectionDAG &DAG) const { |
7199 | SDValue Vec = Op.getOperand(i: 0); |
7200 | SDValue InsVal = Op.getOperand(i: 1); |
7201 | SDValue Idx = Op.getOperand(i: 2); |
7202 | EVT VecVT = Vec.getValueType(); |
7203 | EVT EltVT = VecVT.getVectorElementType(); |
7204 | unsigned VecSize = VecVT.getSizeInBits(); |
7205 | unsigned EltSize = EltVT.getSizeInBits(); |
7206 | SDLoc SL(Op); |
7207 | |
7208 | // Specially handle the case of v4i16 with static indexing. |
7209 | unsigned NumElts = VecVT.getVectorNumElements(); |
7210 | auto KIdx = dyn_cast<ConstantSDNode>(Val&: Idx); |
7211 | if (NumElts == 4 && EltSize == 16 && KIdx) { |
7212 | SDValue BCVec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Vec); |
7213 | |
7214 | SDValue LoHalf = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: BCVec, |
7215 | N2: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32)); |
7216 | SDValue HiHalf = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: BCVec, |
7217 | N2: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32)); |
7218 | |
7219 | SDValue LoVec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i16, Operand: LoHalf); |
7220 | SDValue HiVec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i16, Operand: HiHalf); |
7221 | |
7222 | unsigned Idx = KIdx->getZExtValue(); |
7223 | bool InsertLo = Idx < 2; |
7224 | SDValue InsHalf = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL: SL, VT: MVT::v2i16, |
7225 | N1: InsertLo ? LoVec : HiVec, |
7226 | N2: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i16, Operand: InsVal), |
7227 | N3: DAG.getConstant(Val: InsertLo ? Idx : (Idx - 2), DL: SL, VT: MVT::i32)); |
7228 | |
7229 | InsHalf = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: InsHalf); |
7230 | |
7231 | SDValue Concat = InsertLo ? |
7232 | DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: { InsHalf, HiHalf }) : |
7233 | DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: { LoHalf, InsHalf }); |
7234 | |
7235 | return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: VecVT, Operand: Concat); |
7236 | } |
7237 | |
7238 | // Static indexing does not lower to stack access, and hence there is no need |
7239 | // for special custom lowering to avoid stack access. |
7240 | if (isa<ConstantSDNode>(Val: Idx)) |
7241 | return SDValue(); |
7242 | |
7243 | // Avoid stack access for dynamic indexing by custom lowering to |
7244 | // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec |
7245 | |
7246 | assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits" ); |
7247 | |
7248 | MVT IntVT = MVT::getIntegerVT(BitWidth: VecSize); |
7249 | |
7250 | // Convert vector index to bit-index and get the required bit mask. |
7251 | assert(isPowerOf2_32(EltSize)); |
7252 | const auto EltMask = maskTrailingOnes<uint64_t>(N: EltSize); |
7253 | SDValue ScaleFactor = DAG.getConstant(Val: Log2_32(Value: EltSize), DL: SL, VT: MVT::i32); |
7254 | SDValue ScaledIdx = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: Idx, N2: ScaleFactor); |
7255 | SDValue BFM = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: IntVT, |
7256 | N1: DAG.getConstant(Val: EltMask, DL: SL, VT: IntVT), N2: ScaledIdx); |
7257 | |
7258 | // 1. Create a congruent vector with the target value in each element. |
7259 | SDValue ExtVal = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: IntVT, |
7260 | Operand: DAG.getSplatBuildVector(VT: VecVT, DL: SL, Op: InsVal)); |
7261 | |
7262 | // 2. Mask off all other indices except the required index within (1). |
7263 | SDValue LHS = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: IntVT, N1: BFM, N2: ExtVal); |
7264 | |
7265 | // 3. Mask off the required index within the target vector. |
7266 | SDValue BCVec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: IntVT, Operand: Vec); |
7267 | SDValue RHS = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: IntVT, |
7268 | N1: DAG.getNOT(DL: SL, Val: BFM, VT: IntVT), N2: BCVec); |
7269 | |
7270 | // 4. Get (2) and (3) ORed into the target vector. |
7271 | SDValue BFI = DAG.getNode(Opcode: ISD::OR, DL: SL, VT: IntVT, N1: LHS, N2: RHS); |
7272 | |
7273 | return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: VecVT, Operand: BFI); |
7274 | } |
7275 | |
SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
7277 | SelectionDAG &DAG) const { |
7278 | SDLoc SL(Op); |
7279 | |
7280 | EVT ResultVT = Op.getValueType(); |
7281 | SDValue Vec = Op.getOperand(i: 0); |
7282 | SDValue Idx = Op.getOperand(i: 1); |
7283 | EVT VecVT = Vec.getValueType(); |
7284 | unsigned VecSize = VecVT.getSizeInBits(); |
7285 | EVT EltVT = VecVT.getVectorElementType(); |
7286 | |
7287 | DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr); |
7288 | |
7289 | // Make sure we do any optimizations that will make it easier to fold |
7290 | // source modifiers before obscuring it with bit operations. |
7291 | |
7292 | // XXX - Why doesn't this get called when vector_shuffle is expanded? |
7293 | if (SDValue Combined = performExtractVectorEltCombine(N: Op.getNode(), DCI)) |
7294 | return Combined; |
7295 | |
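// For 128/256/512-bit vectors, split the vector into two halves, select the
// half that holds the requested element, and extract from it with the index
// masked down to that half.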
7296 | if (VecSize == 128 || VecSize == 256 || VecSize == 512) { |
7297 | SDValue Lo, Hi; |
7298 | EVT LoVT, HiVT; |
7299 | std::tie(args&: LoVT, args&: HiVT) = DAG.GetSplitDestVTs(VT: VecVT); |
7300 | |
7301 | if (VecSize == 128) { |
7302 | SDValue V2 = DAG.getBitcast(VT: MVT::v2i64, V: Vec); |
7303 | Lo = DAG.getBitcast(VT: LoVT, |
7304 | V: DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i64, N1: V2, |
7305 | N2: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32))); |
7306 | Hi = DAG.getBitcast(VT: HiVT, |
7307 | V: DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i64, N1: V2, |
7308 | N2: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32))); |
7309 | } else if (VecSize == 256) { |
7310 | SDValue V2 = DAG.getBitcast(VT: MVT::v4i64, V: Vec); |
7311 | SDValue Parts[4]; |
7312 | for (unsigned P = 0; P < 4; ++P) { |
7313 | Parts[P] = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i64, N1: V2, |
7314 | N2: DAG.getConstant(Val: P, DL: SL, VT: MVT::i32)); |
7315 | } |
7316 | |
7317 | Lo = DAG.getBitcast(VT: LoVT, V: DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i64, |
7318 | N1: Parts[0], N2: Parts[1])); |
7319 | Hi = DAG.getBitcast(VT: HiVT, V: DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i64, |
7320 | N1: Parts[2], N2: Parts[3])); |
7321 | } else { |
7322 | assert(VecSize == 512); |
7323 | |
7324 | SDValue V2 = DAG.getBitcast(VT: MVT::v8i64, V: Vec); |
7325 | SDValue Parts[8]; |
7326 | for (unsigned P = 0; P < 8; ++P) { |
7327 | Parts[P] = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i64, N1: V2, |
7328 | N2: DAG.getConstant(Val: P, DL: SL, VT: MVT::i32)); |
7329 | } |
7330 | |
7331 | Lo = DAG.getBitcast(VT: LoVT, |
7332 | V: DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v4i64, |
7333 | N1: Parts[0], N2: Parts[1], N3: Parts[2], N4: Parts[3])); |
7334 | Hi = DAG.getBitcast(VT: HiVT, |
7335 | V: DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v4i64, |
N1: Parts[4], N2: Parts[5], N3: Parts[6], N4: Parts[7]));
7337 | } |
7338 | |
7339 | EVT IdxVT = Idx.getValueType(); |
7340 | unsigned NElem = VecVT.getVectorNumElements(); |
7341 | assert(isPowerOf2_32(NElem)); |
7342 | SDValue IdxMask = DAG.getConstant(Val: NElem / 2 - 1, DL: SL, VT: IdxVT); |
7343 | SDValue NewIdx = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: IdxVT, N1: Idx, N2: IdxMask); |
7344 | SDValue Half = DAG.getSelectCC(DL: SL, LHS: Idx, RHS: IdxMask, True: Hi, False: Lo, Cond: ISD::SETUGT); |
7345 | return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, N1: Half, N2: NewIdx); |
7346 | } |
7347 | |
7348 | assert(VecSize <= 64); |
7349 | |
7350 | MVT IntVT = MVT::getIntegerVT(BitWidth: VecSize); |
7351 | |
7352 | // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly. |
7353 | SDValue VecBC = peekThroughBitcasts(V: Vec); |
7354 | if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) { |
7355 | SDValue Src = VecBC.getOperand(i: 0); |
7356 | Src = DAG.getBitcast(VT: Src.getValueType().changeTypeToInteger(), V: Src); |
7357 | Vec = DAG.getAnyExtOrTrunc(Op: Src, DL: SL, VT: IntVT); |
7358 | } |
7359 | |
7360 | unsigned EltSize = EltVT.getSizeInBits(); |
7361 | assert(isPowerOf2_32(EltSize)); |
7362 | |
7363 | SDValue ScaleFactor = DAG.getConstant(Val: Log2_32(Value: EltSize), DL: SL, VT: MVT::i32); |
7364 | |
7365 | // Convert vector index to bit-index (* EltSize) |
7366 | SDValue ScaledIdx = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: Idx, N2: ScaleFactor); |
7367 | |
7368 | SDValue BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: IntVT, Operand: Vec); |
7369 | SDValue Elt = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: IntVT, N1: BC, N2: ScaledIdx); |
7370 | |
7371 | if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) { |
7372 | SDValue Result = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i16, Operand: Elt); |
7373 | return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: ResultVT, Operand: Result); |
7374 | } |
7375 | |
7376 | return DAG.getAnyExtOrTrunc(Op: Elt, DL: SL, VT: ResultVT); |
7377 | } |
7378 | |
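// Returns true if mask elements Elt and Elt+1 select two consecutive source
// elements starting at an even index, i.e. a pair that can be moved as one
// packed 32-bit piece.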
7379 | static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) { |
7380 | assert(Elt % 2 == 0); |
7381 | return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0); |
7382 | } |
7383 | |
7384 | SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op, |
7385 | SelectionDAG &DAG) const { |
7386 | SDLoc SL(Op); |
7387 | EVT ResultVT = Op.getValueType(); |
7388 | ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Val&: Op); |
7389 | |
7390 | EVT PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16; |
7391 | EVT EltVT = PackVT.getVectorElementType(); |
7392 | int SrcNumElts = Op.getOperand(i: 0).getValueType().getVectorNumElements(); |
7393 | |
7394 | // vector_shuffle <0,1,6,7> lhs, rhs |
7395 | // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2) |
7396 | // |
7397 | // vector_shuffle <6,7,2,3> lhs, rhs |
7398 | // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2) |
7399 | // |
7400 | // vector_shuffle <6,7,0,1> lhs, rhs |
7401 | // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0) |
7402 | |
7403 | // Avoid scalarizing when both halves are reading from consecutive elements. |
7404 | SmallVector<SDValue, 4> Pieces; |
7405 | for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) { |
7406 | if (elementPairIsContiguous(Mask: SVN->getMask(), Elt: I)) { |
7407 | const int Idx = SVN->getMaskElt(Idx: I); |
7408 | int VecIdx = Idx < SrcNumElts ? 0 : 1; |
7409 | int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts; |
7410 | SDValue SubVec = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, |
7411 | VT: PackVT, N1: SVN->getOperand(Num: VecIdx), |
7412 | N2: DAG.getConstant(Val: EltIdx, DL: SL, VT: MVT::i32)); |
7413 | Pieces.push_back(Elt: SubVec); |
7414 | } else { |
7415 | const int Idx0 = SVN->getMaskElt(Idx: I); |
7416 | const int Idx1 = SVN->getMaskElt(Idx: I + 1); |
7417 | int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1; |
7418 | int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1; |
7419 | int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts; |
7420 | int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts; |
7421 | |
7422 | SDValue Vec0 = SVN->getOperand(Num: VecIdx0); |
7423 | SDValue Elt0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, |
7424 | N1: Vec0, N2: DAG.getConstant(Val: EltIdx0, DL: SL, VT: MVT::i32)); |
7425 | |
7426 | SDValue Vec1 = SVN->getOperand(Num: VecIdx1); |
7427 | SDValue Elt1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, |
7428 | N1: Vec1, N2: DAG.getConstant(Val: EltIdx1, DL: SL, VT: MVT::i32)); |
7429 | Pieces.push_back(Elt: DAG.getBuildVector(VT: PackVT, DL: SL, Ops: { Elt0, Elt1 })); |
7430 | } |
7431 | } |
7432 | |
7433 | return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SL, VT: ResultVT, Ops: Pieces); |
7434 | } |
7435 | |
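// Lower SCALAR_TO_VECTOR to a BUILD_VECTOR whose first element is the scalar
// and whose remaining elements are undef.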
7436 | SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op, |
7437 | SelectionDAG &DAG) const { |
7438 | SDValue SVal = Op.getOperand(i: 0); |
7439 | EVT ResultVT = Op.getValueType(); |
7440 | EVT SValVT = SVal.getValueType(); |
7441 | SDValue UndefVal = DAG.getUNDEF(VT: SValVT); |
7442 | SDLoc SL(Op); |
7443 | |
7444 | SmallVector<SDValue, 8> VElts; |
7445 | VElts.push_back(Elt: SVal); |
7446 | for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I) |
7447 | VElts.push_back(Elt: UndefVal); |
7448 | |
7449 | return DAG.getBuildVector(VT: ResultVT, DL: SL, Ops: VElts); |
7450 | } |
7451 | |
7452 | SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op, |
7453 | SelectionDAG &DAG) const { |
7454 | SDLoc SL(Op); |
7455 | EVT VT = Op.getValueType(); |
7456 | |
7457 | if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 || |
7458 | VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) { |
7459 | EVT HalfVT = MVT::getVectorVT(VT: VT.getVectorElementType().getSimpleVT(), |
7460 | NumElements: VT.getVectorNumElements() / 2); |
7461 | MVT HalfIntVT = MVT::getIntegerVT(BitWidth: HalfVT.getSizeInBits()); |
7462 | |
7463 | // Turn into pair of packed build_vectors. |
7464 | // TODO: Special case for constants that can be materialized with s_mov_b64. |
7465 | SmallVector<SDValue, 4> LoOps, HiOps; |
7466 | for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I != E; ++I) { |
7467 | LoOps.push_back(Elt: Op.getOperand(i: I)); |
7468 | HiOps.push_back(Elt: Op.getOperand(i: I + E)); |
7469 | } |
7470 | SDValue Lo = DAG.getBuildVector(VT: HalfVT, DL: SL, Ops: LoOps); |
7471 | SDValue Hi = DAG.getBuildVector(VT: HalfVT, DL: SL, Ops: HiOps); |
7472 | |
7473 | SDValue CastLo = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: HalfIntVT, Operand: Lo); |
7474 | SDValue CastHi = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: HalfIntVT, Operand: Hi); |
7475 | |
7476 | SDValue Blend = DAG.getBuildVector(VT: MVT::getVectorVT(VT: HalfIntVT, NumElements: 2), DL: SL, |
7477 | Ops: { CastLo, CastHi }); |
7478 | return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Blend); |
7479 | } |
7480 | |
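// 16-element vectors are built from four packed quarters, each bitcast to an
// integer and then recombined with a single integer build_vector.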
7481 | if (VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v16bf16) { |
7482 | EVT QuarterVT = MVT::getVectorVT(VT: VT.getVectorElementType().getSimpleVT(), |
7483 | NumElements: VT.getVectorNumElements() / 4); |
7484 | MVT QuarterIntVT = MVT::getIntegerVT(BitWidth: QuarterVT.getSizeInBits()); |
7485 | |
7486 | SmallVector<SDValue, 4> Parts[4]; |
7487 | for (unsigned I = 0, E = VT.getVectorNumElements() / 4; I != E; ++I) { |
7488 | for (unsigned P = 0; P < 4; ++P) |
7489 | Parts[P].push_back(Elt: Op.getOperand(i: I + P * E)); |
7490 | } |
7491 | SDValue Casts[4]; |
7492 | for (unsigned P = 0; P < 4; ++P) { |
7493 | SDValue Vec = DAG.getBuildVector(VT: QuarterVT, DL: SL, Ops: Parts[P]); |
7494 | Casts[P] = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: QuarterIntVT, Operand: Vec); |
7495 | } |
7496 | |
7497 | SDValue Blend = |
7498 | DAG.getBuildVector(VT: MVT::getVectorVT(VT: QuarterIntVT, NumElements: 4), DL: SL, Ops: Casts); |
7499 | return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Blend); |
7500 | } |
7501 | |
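// Likewise, 32-element vectors are built from eight packed pieces.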
7502 | if (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v32bf16) { |
7503 | EVT QuarterVT = MVT::getVectorVT(VT: VT.getVectorElementType().getSimpleVT(), |
7504 | NumElements: VT.getVectorNumElements() / 8); |
7505 | MVT QuarterIntVT = MVT::getIntegerVT(BitWidth: QuarterVT.getSizeInBits()); |
7506 | |
7507 | SmallVector<SDValue, 8> Parts[8]; |
7508 | for (unsigned I = 0, E = VT.getVectorNumElements() / 8; I != E; ++I) { |
7509 | for (unsigned P = 0; P < 8; ++P) |
7510 | Parts[P].push_back(Elt: Op.getOperand(i: I + P * E)); |
7511 | } |
7512 | SDValue Casts[8]; |
7513 | for (unsigned P = 0; P < 8; ++P) { |
7514 | SDValue Vec = DAG.getBuildVector(VT: QuarterVT, DL: SL, Ops: Parts[P]); |
7515 | Casts[P] = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: QuarterIntVT, Operand: Vec); |
7516 | } |
7517 | |
7518 | SDValue Blend = |
7519 | DAG.getBuildVector(VT: MVT::getVectorVT(VT: QuarterIntVT, NumElements: 8), DL: SL, Ops: Casts); |
7520 | return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Blend); |
7521 | } |
7522 | |
7523 | assert(VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16); |
7524 | assert(!Subtarget->hasVOP3PInsts() && "this should be legal" ); |
7525 | |
7526 | SDValue Lo = Op.getOperand(i: 0); |
7527 | SDValue Hi = Op.getOperand(i: 1); |
7528 | |
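     | // Pack the two 16-bit elements into an i32 as (zext(Hi) << 16) | zext(Lo) |
     | // and bitcast to the result type. Undef halves let us skip the |
     | // corresponding extend/shift/or below. |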
7529 | // Avoid adding defined bits with the zero_extend. |
7530 | if (Hi.isUndef()) { |
7531 | Lo = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i16, Operand: Lo); |
7532 | SDValue ExtLo = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i32, Operand: Lo); |
7533 | return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: ExtLo); |
7534 | } |
7535 | |
7536 | Hi = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i16, Operand: Hi); |
7537 | Hi = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: MVT::i32, Operand: Hi); |
7538 | |
7539 | SDValue ShlHi = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: Hi, |
7540 | N2: DAG.getConstant(Val: 16, DL: SL, VT: MVT::i32)); |
7541 | if (Lo.isUndef()) |
7542 | return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: ShlHi); |
7543 | |
7544 | Lo = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i16, Operand: Lo); |
7545 | Lo = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: MVT::i32, Operand: Lo); |
7546 | |
7547 | SDValue Or = DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: Lo, N2: ShlHi); |
7548 | return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Or); |
7549 | } |
7550 | |
7551 | bool |
7552 | SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { |
7553 | // OSes that use ELF REL relocations (instead of RELA) can only store a |
7554 | // 32-bit addend in the instruction, so it is not safe to allow offset folding |
7555 | // which can create arbitrary 64-bit addends. (This is only a problem for |
7556 | // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by |
7557 | // the high 32 bits of the addend.) |
7558 | // |
7559 | // This should be kept in sync with how HasRelocationAddend is initialized in |
7560 | // the constructor of ELFAMDGPUAsmBackend. |
7561 | if (!Subtarget->isAmdHsaOS()) |
7562 | return false; |
7563 | |
7564 | // We can fold offsets for anything that doesn't require a GOT relocation. |
7565 | return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS || |
7566 | GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || |
7567 | GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) && |
7568 | !shouldEmitGOTReloc(GV: GA->getGlobal()); |
7569 | } |
7570 | |
7571 | static SDValue |
7572 | buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, |
7573 | const SDLoc &DL, int64_t Offset, EVT PtrVT, |
7574 | unsigned GAFlags = SIInstrInfo::MO_NONE) { |
7575 | assert(isInt<32>(Offset + 4) && "32-bit offset is expected!" ); |
7576 | // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is |
7577 | // lowered to the following code sequence: |
7578 | // |
7579 | // For constant address space: |
7580 | // s_getpc_b64 s[0:1] |
7581 | // s_add_u32 s0, s0, $symbol |
7582 | // s_addc_u32 s1, s1, 0 |
7583 | // |
7584 | // s_getpc_b64 returns the address of the s_add_u32 instruction and then |
7585 | // a fixup or relocation is emitted to replace $symbol with a literal |
7586 | // constant, which is a pc-relative offset from the encoding of the $symbol |
7587 | // operand to the global variable. |
7588 | // |
7589 | // For global address space: |
7590 | // s_getpc_b64 s[0:1] |
7591 | // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo |
7592 | // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi |
7593 | // |
7594 | // s_getpc_b64 returns the address of the s_add_u32 instruction and then |
7595 | // fixups or relocations are emitted to replace $symbol@*@lo and |
7596 | // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, |
7597 | // which is a 64-bit pc-relative offset from the encoding of the $symbol |
7598 | // operand to the global variable. |
7599 | SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i32, offset: Offset, TargetFlags: GAFlags); |
7600 | SDValue PtrHi; |
7601 | if (GAFlags == SIInstrInfo::MO_NONE) |
7602 | PtrHi = DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32); |
7603 | else |
7604 | PtrHi = DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i32, offset: Offset, TargetFlags: GAFlags + 1); |
7605 | return DAG.getNode(Opcode: AMDGPUISD::PC_ADD_REL_OFFSET, DL, VT: PtrVT, N1: PtrLo, N2: PtrHi); |
7606 | } |
7607 | |
7608 | SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, |
7609 | SDValue Op, |
7610 | SelectionDAG &DAG) const { |
7611 | GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Val&: Op); |
7612 | SDLoc DL(GSD); |
7613 | EVT PtrVT = Op.getValueType(); |
7614 | |
7615 | const GlobalValue *GV = GSD->getGlobal(); |
7616 | if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && |
7617 | shouldUseLDSConstAddress(GV)) || |
7618 | GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS || |
7619 | GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { |
7620 | if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && |
7621 | GV->hasExternalLinkage()) { |
7622 | Type *Ty = GV->getValueType(); |
7623 | // HIP uses an unsized array `extern __shared__ T s[]` (or a similar |
7624 | // zero-sized type in other languages) to declare dynamic shared memory |
7625 | // whose size is not known at compile time. Such arrays are allocated by |
7626 | // the runtime and placed directly after the statically allocated ones, |
7627 | // so they all share the same offset. |
7628 | if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) { |
7629 | assert(PtrVT == MVT::i32 && "32-bit pointer is expected." ); |
7630 | // Adjust alignment for that dynamic shared memory array. |
7631 | Function &F = DAG.getMachineFunction().getFunction(); |
7632 | MFI->setDynLDSAlign(F, GV: *cast<GlobalVariable>(Val: GV)); |
7633 | MFI->setUsesDynamicLDS(true); |
7634 | return SDValue( |
7635 | DAG.getMachineNode(Opcode: AMDGPU::GET_GROUPSTATICSIZE, dl: DL, VT: PtrVT), 0); |
7636 | } |
7637 | } |
7638 | return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); |
7639 | } |
7640 | |
7641 | if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { |
7642 | SDValue GA = DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i32, offset: GSD->getOffset(), |
7643 | TargetFlags: SIInstrInfo::MO_ABS32_LO); |
7644 | return DAG.getNode(Opcode: AMDGPUISD::LDS, DL, VT: MVT::i32, Operand: GA); |
7645 | } |
7646 | |
7647 | if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) { |
7648 | SDValue AddrLo = DAG.getTargetGlobalAddress( |
7649 | GV, DL, VT: MVT::i32, offset: GSD->getOffset(), TargetFlags: SIInstrInfo::MO_ABS32_LO); |
7650 | AddrLo = {DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32, Op1: AddrLo), 0}; |
7651 | |
7652 | SDValue AddrHi = DAG.getTargetGlobalAddress( |
7653 | GV, DL, VT: MVT::i32, offset: GSD->getOffset(), TargetFlags: SIInstrInfo::MO_ABS32_HI); |
7654 | AddrHi = {DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32, Op1: AddrHi), 0}; |
7655 | |
7656 | return DAG.getNode(Opcode: ISD::BUILD_PAIR, DL, VT: MVT::i64, N1: AddrLo, N2: AddrHi); |
7657 | } |
7658 | |
7659 | if (shouldEmitFixup(GV)) |
7660 | return buildPCRelGlobalAddress(DAG, GV, DL, Offset: GSD->getOffset(), PtrVT); |
7661 | |
7662 | if (shouldEmitPCReloc(GV)) |
7663 | return buildPCRelGlobalAddress(DAG, GV, DL, Offset: GSD->getOffset(), PtrVT, |
7664 | GAFlags: SIInstrInfo::MO_REL32); |
7665 | |
7666 | SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, Offset: 0, PtrVT, |
7667 | GAFlags: SIInstrInfo::MO_GOTPCREL32); |
7668 | |
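     | // The GOT entry holds the variable's address, so materialize it with an |
     | // invariant, dereferenceable load from the constant address space. |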
7669 | Type *Ty = PtrVT.getTypeForEVT(Context&: *DAG.getContext()); |
7670 | PointerType *PtrTy = PointerType::get(ElementType: Ty, AddressSpace: AMDGPUAS::CONSTANT_ADDRESS); |
7671 | const DataLayout &DataLayout = DAG.getDataLayout(); |
7672 | Align Alignment = DataLayout.getABITypeAlign(Ty: PtrTy); |
7673 | MachinePointerInfo PtrInfo |
7674 | = MachinePointerInfo::getGOT(MF&: DAG.getMachineFunction()); |
7675 | |
7676 | return DAG.getLoad(VT: PtrVT, dl: DL, Chain: DAG.getEntryNode(), Ptr: GOTAddr, PtrInfo, Alignment, |
7677 | MMOFlags: MachineMemOperand::MODereferenceable | |
7678 | MachineMemOperand::MOInvariant); |
7679 | } |
7680 | |
7681 | SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, |
7682 | const SDLoc &DL, SDValue V) const { |
7683 | // We can't use S_MOV_B32 directly, because there is no way to specify m0 as |
7684 | // the destination register. |
7685 | // |
7686 | // We can't use CopyToReg, because MachineCSE won't combine COPY instructions, |
7687 | // so we will end up with redundant moves to m0. |
7688 | // |
7689 | // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result. |
7690 | |
7691 | // A Null SDValue creates a glue result. |
7692 | SDNode *M0 = DAG.getMachineNode(Opcode: AMDGPU::SI_INIT_M0, dl: DL, VT1: MVT::Other, VT2: MVT::Glue, |
7693 | Op1: V, Op2: Chain); |
7694 | return SDValue(M0, 0); |
7695 | } |
7696 | |
7697 | SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, |
7698 | SDValue Op, |
7699 | MVT VT, |
7700 | unsigned Offset) const { |
7701 | SDLoc SL(Op); |
7702 | SDValue Param = lowerKernargMemParameter( |
7703 | DAG, VT: MVT::i32, MemVT: MVT::i32, SL, Chain: DAG.getEntryNode(), Offset, Alignment: Align(4), Signed: false); |
7704 | // The local size values will have the hi 16-bits as zero. |
7705 | return DAG.getNode(Opcode: ISD::AssertZext, DL: SL, VT: MVT::i32, N1: Param, |
7706 | N2: DAG.getValueType(VT)); |
7707 | } |
7708 | |
7709 | static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, |
7710 | EVT VT) { |
7711 | DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(), |
7712 | "non-hsa intrinsic with hsa target" , |
7713 | DL.getDebugLoc()); |
7714 | DAG.getContext()->diagnose(DI: BadIntrin); |
7715 | return DAG.getUNDEF(VT); |
7716 | } |
7717 | |
7718 | static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, |
7719 | EVT VT) { |
7720 | DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(), |
7721 | "intrinsic not supported on subtarget" , |
7722 | DL.getDebugLoc()); |
7723 | DAG.getContext()->diagnose(DI: BadIntrin); |
7724 | return DAG.getUNDEF(VT); |
7725 | } |
7726 | |
7727 | static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, |
7728 | ArrayRef<SDValue> Elts) { |
7729 | assert(!Elts.empty()); |
7730 | MVT Type; |
7731 | unsigned NumElts = Elts.size(); |
7732 | |
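     | // Results of up to 12 dwords use an exactly sized vector; 13 to 16 dwords |
     | // are padded out to a v16f32 with undef elements below. |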
7733 | if (NumElts <= 12) { |
7734 | Type = MVT::getVectorVT(VT: MVT::f32, NumElements: NumElts); |
7735 | } else { |
7736 | assert(Elts.size() <= 16); |
7737 | Type = MVT::v16f32; |
7738 | NumElts = 16; |
7739 | } |
7740 | |
7741 | SmallVector<SDValue, 16> VecElts(NumElts); |
7742 | for (unsigned i = 0; i < Elts.size(); ++i) { |
7743 | SDValue Elt = Elts[i]; |
7744 | if (Elt.getValueType() != MVT::f32) |
7745 | Elt = DAG.getBitcast(VT: MVT::f32, V: Elt); |
7746 | VecElts[i] = Elt; |
7747 | } |
7748 | for (unsigned i = Elts.size(); i < NumElts; ++i) |
7749 | VecElts[i] = DAG.getUNDEF(VT: MVT::f32); |
7750 | |
7751 | if (NumElts == 1) |
7752 | return VecElts[0]; |
7753 | return DAG.getBuildVector(VT: Type, DL, Ops: VecElts); |
7754 | } |
7755 | |
7756 | static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, |
7757 | SDValue Src, int ExtraElts) { |
7758 | EVT SrcVT = Src.getValueType(); |
7759 | |
7760 | SmallVector<SDValue, 8> Elts; |
7761 | |
7762 | if (SrcVT.isVector()) |
7763 | DAG.ExtractVectorElements(Op: Src, Args&: Elts); |
7764 | else |
7765 | Elts.push_back(Elt: Src); |
7766 | |
7767 | SDValue Undef = DAG.getUNDEF(VT: SrcVT.getScalarType()); |
7768 | while (ExtraElts--) |
7769 | Elts.push_back(Elt: Undef); |
7770 | |
7771 | return DAG.getBuildVector(VT: CastVT, DL, Ops: Elts); |
7772 | } |
7773 | |
7774 | // Re-construct the required return value for an image load intrinsic. |
7775 | // This is more complicated due to the optional use of TexFailCtrl, which |
7776 | // means the required return type is an aggregate. |
7777 | static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, |
7778 | ArrayRef<EVT> ResultTypes, bool IsTexFail, |
7779 | bool Unpacked, bool IsD16, int DMaskPop, |
7780 | int NumVDataDwords, bool IsAtomicPacked16Bit, |
7781 | const SDLoc &DL) { |
7782 | // Determine the required return type. This is the same regardless of the IsTexFail flag. |
7783 | EVT ReqRetVT = ResultTypes[0]; |
7784 | int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1; |
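     | // With packed D16 (or a packed 16-bit atomic), two 16-bit elements share |
     | // each dword, so round the element count up to a dword count. |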
7785 | int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit) |
7786 | ? (ReqRetNumElts + 1) / 2 |
7787 | : ReqRetNumElts; |
7788 | |
7789 | int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2; |
7790 | |
7791 | MVT DataDwordVT = NumDataDwords == 1 ? |
7792 | MVT::i32 : MVT::getVectorVT(VT: MVT::i32, NumElements: NumDataDwords); |
7793 | |
7794 | MVT MaskPopVT = MaskPopDwords == 1 ? |
7795 | MVT::i32 : MVT::getVectorVT(VT: MVT::i32, NumElements: MaskPopDwords); |
7796 | |
7797 | SDValue Data(Result, 0); |
7798 | SDValue TexFail; |
7799 | |
7800 | if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) { |
7801 | SDValue ZeroIdx = DAG.getConstant(Val: 0, DL, VT: MVT::i32); |
7802 | if (MaskPopVT.isVector()) { |
7803 | Data = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MaskPopVT, |
7804 | N1: SDValue(Result, 0), N2: ZeroIdx); |
7805 | } else { |
7806 | Data = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MaskPopVT, |
7807 | N1: SDValue(Result, 0), N2: ZeroIdx); |
7808 | } |
7809 | } |
7810 | |
7811 | if (DataDwordVT.isVector() && !IsAtomicPacked16Bit) |
7812 | Data = padEltsToUndef(DAG, DL, CastVT: DataDwordVT, Src: Data, |
7813 | ExtraElts: NumDataDwords - MaskPopDwords); |
7814 | |
7815 | if (IsD16) |
7816 | Data = adjustLoadValueTypeImpl(Result: Data, LoadVT: ReqRetVT, DL, DAG, Unpacked); |
7817 | |
7818 | EVT LegalReqRetVT = ReqRetVT; |
7819 | if (!ReqRetVT.isVector()) { |
7820 | if (!Data.getValueType().isInteger()) |
7821 | Data = DAG.getNode(Opcode: ISD::BITCAST, DL, |
7822 | VT: Data.getValueType().changeTypeToInteger(), Operand: Data); |
7823 | Data = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: ReqRetVT.changeTypeToInteger(), Operand: Data); |
7824 | } else { |
7825 | // We need to widen the return vector to a legal type |
7826 | if ((ReqRetVT.getVectorNumElements() % 2) == 1 && |
7827 | ReqRetVT.getVectorElementType().getSizeInBits() == 16) { |
7828 | LegalReqRetVT = |
7829 | EVT::getVectorVT(Context&: *DAG.getContext(), VT: ReqRetVT.getVectorElementType(), |
7830 | NumElements: ReqRetVT.getVectorNumElements() + 1); |
7831 | } |
7832 | } |
7833 | Data = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: LegalReqRetVT, Operand: Data); |
7834 | |
7835 | if (IsTexFail) { |
7836 | TexFail = |
7837 | DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: SDValue(Result, 0), |
7838 | N2: DAG.getConstant(Val: MaskPopDwords, DL, VT: MVT::i32)); |
7839 | |
7840 | return DAG.getMergeValues(Ops: {Data, TexFail, SDValue(Result, 1)}, dl: DL); |
7841 | } |
7842 | |
7843 | if (Result->getNumValues() == 1) |
7844 | return Data; |
7845 | |
7846 | return DAG.getMergeValues(Ops: {Data, SDValue(Result, 1)}, dl: DL); |
7847 | } |
7848 | |
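     | // Decode the texfailctrl immediate: bit 0 enables TFE and bit 1 enables LWE. |
     | // IsTexFail is set if the value is non-zero; the function returns false if |
     | // any bits other than TFE and LWE are set. |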
7849 | static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE, |
7850 | SDValue *LWE, bool &IsTexFail) { |
7851 | auto TexFailCtrlConst = cast<ConstantSDNode>(Val: TexFailCtrl.getNode()); |
7852 | |
7853 | uint64_t Value = TexFailCtrlConst->getZExtValue(); |
7854 | if (Value) { |
7855 | IsTexFail = true; |
7856 | } |
7857 | |
7858 | SDLoc DL(TexFailCtrlConst); |
7859 | *TFE = DAG.getTargetConstant(Val: (Value & 0x1) ? 1 : 0, DL, VT: MVT::i32); |
7860 | Value &= ~(uint64_t)0x1; |
7861 | *LWE = DAG.getTargetConstant(Val: (Value & 0x2) ? 1 : 0, DL, VT: MVT::i32); |
7862 | Value &= ~(uint64_t)0x2; |
7863 | |
7864 | return Value == 0; |
7865 | } |
7866 | |
7867 | static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op, |
7868 | MVT PackVectorVT, |
7869 | SmallVectorImpl<SDValue> &PackedAddrs, |
7870 | unsigned DimIdx, unsigned EndIdx, |
7871 | unsigned NumGradients) { |
7872 | SDLoc DL(Op); |
7873 | for (unsigned I = DimIdx; I < EndIdx; I++) { |
7874 | SDValue Addr = Op.getOperand(i: I); |
7875 | |
7876 | // Gradients are packed with undef for each coordinate. |
7877 | // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this: |
7878 | // 1D: undef,dx/dh; undef,dx/dv |
7879 | // 2D: dy/dh,dx/dh; dy/dv,dx/dv |
7880 | // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv |
7881 | if (((I + 1) >= EndIdx) || |
7882 | ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 || |
7883 | I == DimIdx + NumGradients - 1))) { |
7884 | if (Addr.getValueType() != MVT::i16) |
7885 | Addr = DAG.getBitcast(VT: MVT::i16, V: Addr); |
7886 | Addr = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::i32, Operand: Addr); |
7887 | } else { |
7888 | Addr = DAG.getBuildVector(VT: PackVectorVT, DL, Ops: {Addr, Op.getOperand(i: I + 1)}); |
7889 | I++; |
7890 | } |
7891 | Addr = DAG.getBitcast(VT: MVT::f32, V: Addr); |
7892 | PackedAddrs.push_back(Elt: Addr); |
7893 | } |
7894 | } |
7895 | |
7896 | SDValue SITargetLowering::lowerImage(SDValue Op, |
7897 | const AMDGPU::ImageDimIntrinsicInfo *Intr, |
7898 | SelectionDAG &DAG, bool WithChain) const { |
7899 | SDLoc DL(Op); |
7900 | MachineFunction &MF = DAG.getMachineFunction(); |
7901 | const GCNSubtarget* ST = &MF.getSubtarget<GCNSubtarget>(); |
7902 | const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = |
7903 | AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Intr->BaseOpcode); |
7904 | const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(DimEnum: Intr->Dim); |
7905 | unsigned IntrOpcode = Intr->BaseOpcode; |
7906 | bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI: *Subtarget); |
7907 | bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI: *Subtarget); |
7908 | bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI: *Subtarget); |
7909 | |
7910 | SmallVector<EVT, 3> ResultTypes(Op->values()); |
7911 | SmallVector<EVT, 3> OrigResultTypes(Op->values()); |
7912 | bool IsD16 = false; |
7913 | bool IsG16 = false; |
7914 | bool IsA16 = false; |
7915 | SDValue VData; |
7916 | int NumVDataDwords = 0; |
7917 | bool AdjustRetType = false; |
7918 | bool IsAtomicPacked16Bit = false; |
7919 | |
7920 | // Offset of intrinsic arguments |
7921 | const unsigned ArgOffset = WithChain ? 2 : 1; |
7922 | |
7923 | unsigned DMask; |
7924 | unsigned DMaskLanes = 0; |
7925 | |
7926 | if (BaseOpcode->Atomic) { |
7927 | VData = Op.getOperand(i: 2); |
7928 | |
7929 | IsAtomicPacked16Bit = |
7930 | (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 || |
7931 | Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16); |
7932 | |
7933 | bool Is64Bit = VData.getValueSizeInBits() == 64; |
7934 | if (BaseOpcode->AtomicX2) { |
7935 | SDValue VData2 = Op.getOperand(i: 3); |
7936 | VData = DAG.getBuildVector(VT: Is64Bit ? MVT::v2i64 : MVT::v2i32, DL, |
7937 | Ops: {VData, VData2}); |
7938 | if (Is64Bit) |
7939 | VData = DAG.getBitcast(VT: MVT::v4i32, V: VData); |
7940 | |
7941 | ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32; |
7942 | DMask = Is64Bit ? 0xf : 0x3; |
7943 | NumVDataDwords = Is64Bit ? 4 : 2; |
7944 | } else { |
7945 | DMask = Is64Bit ? 0x3 : 0x1; |
7946 | NumVDataDwords = Is64Bit ? 2 : 1; |
7947 | } |
7948 | } else { |
7949 | DMask = Op->getConstantOperandVal(Num: ArgOffset + Intr->DMaskIndex); |
7950 | DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(Value: DMask); |
7951 | |
7952 | if (BaseOpcode->Store) { |
7953 | VData = Op.getOperand(i: 2); |
7954 | |
7955 | MVT StoreVT = VData.getSimpleValueType(); |
7956 | if (StoreVT.getScalarType() == MVT::f16) { |
7957 | if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16) |
7958 | return Op; // D16 is unsupported for this instruction |
7959 | |
7960 | IsD16 = true; |
7961 | VData = handleD16VData(VData, DAG, ImageStore: true); |
7962 | } |
7963 | |
7964 | NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32; |
7965 | } else if (!BaseOpcode->NoReturn) { |
7966 | // Work out the number of dwords based on the dmask popcount, the |
7967 | // underlying type, and whether packing is supported. |
7968 | MVT LoadVT = ResultTypes[0].getSimpleVT(); |
7969 | if (LoadVT.getScalarType() == MVT::f16) { |
7970 | if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16) |
7971 | return Op; // D16 is unsupported for this instruction |
7972 | |
7973 | IsD16 = true; |
7974 | } |
7975 | |
7976 | // Confirm that the return type is large enough for the dmask specified |
7977 | if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) || |
7978 | (!LoadVT.isVector() && DMaskLanes > 1)) |
7979 | return Op; |
7980 | |
7981 | // The sq block of gfx8 and gfx9 does not estimate register use correctly |
7982 | // for d16 image_gather4, image_gather4_l, and image_gather4_lz |
7983 | // instructions. |
7984 | if (IsD16 && !Subtarget->hasUnpackedD16VMem() && |
7985 | !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug())) |
7986 | NumVDataDwords = (DMaskLanes + 1) / 2; |
7987 | else |
7988 | NumVDataDwords = DMaskLanes; |
7989 | |
7990 | AdjustRetType = true; |
7991 | } |
7992 | } |
7993 | |
7994 | unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd; |
7995 | SmallVector<SDValue, 4> VAddrs; |
7996 | |
7997 | // Check for 16 bit addresses or derivatives and pack if true. |
7998 | MVT VAddrVT = |
7999 | Op.getOperand(i: ArgOffset + Intr->GradientStart).getSimpleValueType(); |
8000 | MVT VAddrScalarVT = VAddrVT.getScalarType(); |
8001 | MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16; |
8002 | IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16; |
8003 | |
8004 | VAddrVT = Op.getOperand(i: ArgOffset + Intr->CoordStart).getSimpleValueType(); |
8005 | VAddrScalarVT = VAddrVT.getScalarType(); |
8006 | MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16; |
8007 | IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16; |
8008 | |
8009 | // Push back extra arguments. |
8010 | for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) { |
8011 | if (IsA16 && (Op.getOperand(i: ArgOffset + I).getValueType() == MVT::f16)) { |
8012 | assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument" ); |
8013 | // Special handling of bias when A16 is on. Bias is of type half but |
8014 | // occupies a full 32 bits. |
8015 | SDValue Bias = DAG.getBuildVector( |
8016 | VT: MVT::v2f16, DL, |
8017 | Ops: {Op.getOperand(i: ArgOffset + I), DAG.getUNDEF(VT: MVT::f16)}); |
8018 | VAddrs.push_back(Elt: Bias); |
8019 | } else { |
8020 | assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) && |
8021 | "Bias needs to be converted to 16 bit in A16 mode" ); |
8022 | VAddrs.push_back(Elt: Op.getOperand(i: ArgOffset + I)); |
8023 | } |
8024 | } |
8025 | |
8026 | if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) { |
8027 | // 16-bit gradients are supported, but are tied to the A16 control, |
8028 | // so both gradients and addresses must be 16-bit. |
8029 | LLVM_DEBUG( |
8030 | dbgs() << "Failed to lower image intrinsic: 16 bit addresses " |
8031 | "require 16 bit args for both gradients and addresses" ); |
8032 | return Op; |
8033 | } |
8034 | |
8035 | if (IsA16) { |
8036 | if (!ST->hasA16()) { |
8037 | LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not " |
8038 | "support 16 bit addresses\n" ); |
8039 | return Op; |
8040 | } |
8041 | } |
8042 | |
8043 | // We've dealt with incorrect input, so we know that if IsA16 or IsG16 is |
8044 | // set then we have to compress/pack the operands (the addresses, the |
8045 | // gradients, or both). |
8046 | // In the case where A16 and gradients are tied (no G16 support), we have |
8047 | // already verified that both IsA16 and IsG16 are true. |
8048 | if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) { |
8049 | // Activate g16 |
8050 | const AMDGPU::MIMGG16MappingInfo *G16MappingInfo = |
8051 | AMDGPU::getMIMGG16MappingInfo(G: Intr->BaseOpcode); |
8052 | IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16 |
8053 | } |
8054 | |
8055 | // Add gradients (packed or unpacked) |
8056 | if (IsG16) { |
8057 | // Pack the gradients |
8058 | // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart); |
8059 | packImage16bitOpsToDwords(DAG, Op, PackVectorVT: GradPackVectorVT, PackedAddrs&: VAddrs, |
8060 | DimIdx: ArgOffset + Intr->GradientStart, |
8061 | EndIdx: ArgOffset + Intr->CoordStart, NumGradients: Intr->NumGradients); |
8062 | } else { |
8063 | for (unsigned I = ArgOffset + Intr->GradientStart; |
8064 | I < ArgOffset + Intr->CoordStart; I++) |
8065 | VAddrs.push_back(Elt: Op.getOperand(i: I)); |
8066 | } |
8067 | |
8068 | // Add addresses (packed or unpacked) |
8069 | if (IsA16) { |
8070 | packImage16bitOpsToDwords(DAG, Op, PackVectorVT: AddrPackVectorVT, PackedAddrs&: VAddrs, |
8071 | DimIdx: ArgOffset + Intr->CoordStart, EndIdx: VAddrEnd, |
8072 | NumGradients: 0 /* No gradients */); |
8073 | } else { |
8074 | // Add uncompressed address |
8075 | for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++) |
8076 | VAddrs.push_back(Elt: Op.getOperand(i: I)); |
8077 | } |
8078 | |
8079 | // If the register allocator cannot place the address registers contiguously |
8080 | // without introducing moves, then using the non-sequential address encoding |
8081 | // is always preferable, since it saves VALU instructions and is usually a |
8082 | // wash in terms of code size or even better. |
8083 | // |
8084 | // However, we currently have no way of hinting to the register allocator that |
8085 | // MIMG addresses should be placed contiguously when it is possible to do so, |
8086 | // so force non-NSA for the common 2-address case as a heuristic. |
8087 | // |
8088 | // SIShrinkInstructions will convert NSA encodings to non-NSA after register |
8089 | // allocation when possible. |
8090 | // |
8091 | // Partial NSA is allowed on GFX11+ where the final register is a contiguous |
8092 | // set of the remaining addresses. |
8093 | const unsigned NSAMaxSize = ST->getNSAMaxSize(HasSampler: BaseOpcode->Sampler); |
8094 | const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding(); |
8095 | const bool UseNSA = ST->hasNSAEncoding() && |
8096 | VAddrs.size() >= ST->getNSAThreshold(MF) && |
8097 | (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding); |
8098 | const bool UsePartialNSA = |
8099 | UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize; |
8100 | |
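     | // For example, with NSAMaxSize == 5 and seven address dwords, partial NSA |
     | // keeps the first four as separate operands and packs the remaining three |
     | // into one contiguous register. |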
8101 | SDValue VAddr; |
8102 | if (UsePartialNSA) { |
8103 | VAddr = getBuildDwordsVector(DAG, DL, |
8104 | Elts: ArrayRef(VAddrs).drop_front(N: NSAMaxSize - 1)); |
8105 | } else if (!UseNSA) { |
8107 | VAddr = getBuildDwordsVector(DAG, DL, Elts: VAddrs); |
8108 | } |
8109 | |
8110 | SDValue True = DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1); |
8111 | SDValue False = DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1); |
8112 | SDValue Unorm; |
8113 | if (!BaseOpcode->Sampler) { |
8114 | Unorm = True; |
8115 | } else { |
8116 | uint64_t UnormConst = |
8117 | Op.getConstantOperandVal(i: ArgOffset + Intr->UnormIndex); |
8118 | |
8119 | Unorm = UnormConst ? True : False; |
8120 | } |
8121 | |
8122 | SDValue TFE; |
8123 | SDValue LWE; |
8124 | SDValue TexFail = Op.getOperand(i: ArgOffset + Intr->TexFailCtrlIndex); |
8125 | bool IsTexFail = false; |
8126 | if (!parseTexFail(TexFailCtrl: TexFail, DAG, TFE: &TFE, LWE: &LWE, IsTexFail)) |
8127 | return Op; |
8128 | |
8129 | if (IsTexFail) { |
8130 | if (!DMaskLanes) { |
8131 | // Expecting to get an error flag since TFC is on and dmask is 0. |
8132 | // Force dmask to be at least 1, otherwise the instruction will fail. |
8133 | DMask = 0x1; |
8134 | DMaskLanes = 1; |
8135 | NumVDataDwords = 1; |
8136 | } |
8137 | NumVDataDwords += 1; |
8138 | AdjustRetType = true; |
8139 | } |
8140 | |
8141 | // Something earlier may have tagged the return type as needing adjustment. |
8142 | // This happens if the instruction is a load or has TexFailCtrl flags set. |
8143 | if (AdjustRetType) { |
8144 | // NumVDataDwords reflects the true number of dwords required in the return type |
8145 | if (DMaskLanes == 0 && !BaseOpcode->Store) { |
8146 | // This is a no-op load. This can be eliminated |
8147 | SDValue Undef = DAG.getUNDEF(VT: Op.getValueType()); |
8148 | if (isa<MemSDNode>(Val: Op)) |
8149 | return DAG.getMergeValues(Ops: {Undef, Op.getOperand(i: 0)}, dl: DL); |
8150 | return Undef; |
8151 | } |
8152 | |
8153 | EVT NewVT = NumVDataDwords > 1 ? |
8154 | EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements: NumVDataDwords) |
8155 | : MVT::i32; |
8156 | |
8157 | ResultTypes[0] = NewVT; |
8158 | if (ResultTypes.size() == 3) { |
8159 | // The original result was an aggregate type used for TexFailCtrl results. |
8160 | // The actual instruction returns a vector type, which has now been |
8161 | // created; remove the aggregate result. |
8162 | ResultTypes.erase(CI: &ResultTypes[1]); |
8163 | } |
8164 | } |
8165 | |
8166 | unsigned CPol = Op.getConstantOperandVal(i: ArgOffset + Intr->CachePolicyIndex); |
8167 | if (BaseOpcode->Atomic) |
8168 | CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization |
8169 | if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) | |
8170 | AMDGPU::CPol::VOLATILE)) |
8171 | return Op; |
8172 | |
8173 | SmallVector<SDValue, 26> Ops; |
8174 | if (BaseOpcode->Store || BaseOpcode->Atomic) |
8175 | Ops.push_back(Elt: VData); // vdata |
8176 | if (UsePartialNSA) { |
8177 | append_range(C&: Ops, R: ArrayRef(VAddrs).take_front(N: NSAMaxSize - 1)); |
8178 | Ops.push_back(Elt: VAddr); |
8179 | } else if (UseNSA) |
8181 | append_range(C&: Ops, R&: VAddrs); |
8182 | else |
8183 | Ops.push_back(Elt: VAddr); |
8184 | Ops.push_back(Elt: Op.getOperand(i: ArgOffset + Intr->RsrcIndex)); |
8185 | if (BaseOpcode->Sampler) |
8186 | Ops.push_back(Elt: Op.getOperand(i: ArgOffset + Intr->SampIndex)); |
8187 | Ops.push_back(Elt: DAG.getTargetConstant(Val: DMask, DL, VT: MVT::i32)); |
8188 | if (IsGFX10Plus) |
8189 | Ops.push_back(Elt: DAG.getTargetConstant(Val: DimInfo->Encoding, DL, VT: MVT::i32)); |
8190 | if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA) |
8191 | Ops.push_back(Elt: Unorm); |
8192 | Ops.push_back(Elt: DAG.getTargetConstant(Val: CPol, DL, VT: MVT::i32)); |
8193 | Ops.push_back(Elt: IsA16 && // r128, a16 for gfx9 |
8194 | ST->hasFeature(Feature: AMDGPU::FeatureR128A16) ? True : False); |
8195 | if (IsGFX10Plus) |
8196 | Ops.push_back(Elt: IsA16 ? True : False); |
8197 | if (!Subtarget->hasGFX90AInsts()) { |
8198 | Ops.push_back(Elt: TFE); // tfe |
8199 | } else if (TFE->getAsZExtVal()) { |
8200 | report_fatal_error(reason: "TFE is not supported on this GPU" ); |
8201 | } |
8202 | if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA) |
8203 | Ops.push_back(Elt: LWE); // lwe |
8204 | if (!IsGFX10Plus) |
8205 | Ops.push_back(Elt: DimInfo->DA ? True : False); |
8206 | if (BaseOpcode->HasD16) |
8207 | Ops.push_back(Elt: IsD16 ? True : False); |
8208 | if (isa<MemSDNode>(Val: Op)) |
8209 | Ops.push_back(Elt: Op.getOperand(i: 0)); // chain |
8210 | |
8211 | int NumVAddrDwords = |
8212 | UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32; |
8213 | int Opcode = -1; |
8214 | |
8215 | if (IsGFX12Plus) { |
8216 | Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx12, |
8217 | VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords); |
8218 | } else if (IsGFX11Plus) { |
8219 | Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, |
8220 | MIMGEncoding: UseNSA ? AMDGPU::MIMGEncGfx11NSA |
8221 | : AMDGPU::MIMGEncGfx11Default, |
8222 | VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords); |
8223 | } else if (IsGFX10Plus) { |
8224 | Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, |
8225 | MIMGEncoding: UseNSA ? AMDGPU::MIMGEncGfx10NSA |
8226 | : AMDGPU::MIMGEncGfx10Default, |
8227 | VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords); |
8228 | } else { |
8229 | if (Subtarget->hasGFX90AInsts()) { |
8230 | Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx90a, |
8231 | VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords); |
8232 | if (Opcode == -1) |
8233 | report_fatal_error( |
8234 | reason: "requested image instruction is not supported on this GPU" ); |
8235 | } |
8236 | if (Opcode == -1 && |
8237 | Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) |
8238 | Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx8, |
8239 | VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords); |
8240 | if (Opcode == -1) |
8241 | Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx6, |
8242 | VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords); |
8243 | } |
8244 | if (Opcode == -1) |
8245 | return Op; |
8246 | |
8247 | MachineSDNode *NewNode = DAG.getMachineNode(Opcode, dl: DL, ResultTys: ResultTypes, Ops); |
8248 | if (auto MemOp = dyn_cast<MemSDNode>(Val&: Op)) { |
8249 | MachineMemOperand *MemRef = MemOp->getMemOperand(); |
8250 | DAG.setNodeMemRefs(N: NewNode, NewMemRefs: {MemRef}); |
8251 | } |
8252 | |
8253 | if (BaseOpcode->AtomicX2) { |
8254 | SmallVector<SDValue, 1> Elt; |
8255 | DAG.ExtractVectorElements(Op: SDValue(NewNode, 0), Args&: Elt, Start: 0, Count: 1); |
8256 | return DAG.getMergeValues(Ops: {Elt[0], SDValue(NewNode, 1)}, dl: DL); |
8257 | } |
8258 | if (BaseOpcode->NoReturn) |
8259 | return SDValue(NewNode, 0); |
8260 | return constructRetValue(DAG, Result: NewNode, ResultTypes: OrigResultTypes, IsTexFail, |
8261 | Unpacked: Subtarget->hasUnpackedD16VMem(), IsD16, DMaskPop: DMaskLanes, |
8262 | NumVDataDwords, IsAtomicPacked16Bit, DL); |
8263 | } |
8264 | |
8265 | SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, |
8266 | SDValue Offset, SDValue CachePolicy, |
8267 | SelectionDAG &DAG) const { |
8268 | MachineFunction &MF = DAG.getMachineFunction(); |
8269 | |
8270 | const DataLayout &DataLayout = DAG.getDataLayout(); |
8271 | Align Alignment = |
8272 | DataLayout.getABITypeAlign(Ty: VT.getTypeForEVT(Context&: *DAG.getContext())); |
8273 | |
8274 | MachineMemOperand *MMO = MF.getMachineMemOperand( |
8275 | PtrInfo: MachinePointerInfo(), |
8276 | F: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | |
8277 | MachineMemOperand::MOInvariant, |
8278 | Size: VT.getStoreSize(), BaseAlignment: Alignment); |
8279 | |
8280 | if (!Offset->isDivergent()) { |
8281 | SDValue Ops[] = {Rsrc, Offset, CachePolicy}; |
8282 | |
8283 | // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the |
8284 | // s_buffer_load_u16 instruction is emitted for both signed and unsigned |
8285 | // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext |
8286 | // and generates s_buffer_load_i16 (performSignExtendInRegCombine). |
8287 | if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) { |
8288 | SDValue BufferLoad = |
8289 | DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::SBUFFER_LOAD_USHORT, dl: DL, |
8290 | VTList: DAG.getVTList(VT: MVT::i32), Ops, MemVT: VT, MMO); |
8291 | return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: BufferLoad); |
8292 | } |
8293 | |
8294 | // Widen vec3 load to vec4. |
8295 | if (VT.isVector() && VT.getVectorNumElements() == 3 && |
8296 | !Subtarget->hasScalarDwordx3Loads()) { |
8297 | EVT WidenedVT = |
8298 | EVT::getVectorVT(Context&: *DAG.getContext(), VT: VT.getVectorElementType(), NumElements: 4); |
8299 | auto WidenedOp = DAG.getMemIntrinsicNode( |
8300 | Opcode: AMDGPUISD::SBUFFER_LOAD, dl: DL, VTList: DAG.getVTList(VT: WidenedVT), Ops, MemVT: WidenedVT, |
8301 | MMO: MF.getMachineMemOperand(MMO, Offset: 0, Size: WidenedVT.getStoreSize())); |
8302 | auto Subvector = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: WidenedOp, |
8303 | N2: DAG.getVectorIdxConstant(Val: 0, DL)); |
8304 | return Subvector; |
8305 | } |
8306 | |
8307 | return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::SBUFFER_LOAD, dl: DL, |
8308 | VTList: DAG.getVTList(VT), Ops, MemVT: VT, MMO); |
8309 | } |
8310 | |
8311 | // We have a divergent offset. Emit a MUBUF buffer load instead. We can |
8312 | // assume that the buffer is unswizzled. |
8313 | SDValue Ops[] = { |
8314 | DAG.getEntryNode(), // Chain |
8315 | Rsrc, // rsrc |
8316 | DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex |
8317 | {}, // voffset |
8318 | {}, // soffset |
8319 | {}, // offset |
8320 | CachePolicy, // cachepolicy |
8321 | DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen |
8322 | }; |
8323 | if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) { |
8324 | setBufferOffsets(CombinedOffset: Offset, DAG, Offsets: &Ops[3], Alignment: Align(4)); |
8325 | return handleByteShortBufferLoads(DAG, LoadVT: VT, DL, Ops, MMO); |
8326 | } |
8327 | |
8328 | SmallVector<SDValue, 4> Loads; |
8329 | unsigned NumLoads = 1; |
8330 | MVT LoadVT = VT.getSimpleVT(); |
8331 | unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1; |
8332 | assert((LoadVT.getScalarType() == MVT::i32 || |
8333 | LoadVT.getScalarType() == MVT::f32)); |
8334 | |
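     | // v8 and v16 results do not map onto a single load; split them into |
     | // 16-byte (dwordx4) loads and concatenate the pieces afterwards. |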
8335 | if (NumElts == 8 || NumElts == 16) { |
8336 | NumLoads = NumElts / 4; |
8337 | LoadVT = MVT::getVectorVT(VT: LoadVT.getScalarType(), NumElements: 4); |
8338 | } |
8339 | |
8340 | SDVTList VTList = DAG.getVTList(VTs: {LoadVT, MVT::Glue}); |
8341 | |
8342 | // Use the alignment to ensure that the required offsets will fit into the |
8343 | // immediate offsets. |
8344 | setBufferOffsets(CombinedOffset: Offset, DAG, Offsets: &Ops[3], |
8345 | Alignment: NumLoads > 1 ? Align(16 * NumLoads) : Align(4)); |
8346 | |
8347 | uint64_t InstOffset = Ops[5]->getAsZExtVal(); |
8348 | for (unsigned i = 0; i < NumLoads; ++i) { |
8349 | Ops[5] = DAG.getTargetConstant(Val: InstOffset + 16 * i, DL, VT: MVT::i32); |
8350 | Loads.push_back(Elt: getMemIntrinsicNode(Opcode: AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops, |
8351 | MemVT: LoadVT, MMO, DAG)); |
8352 | } |
8353 | |
8354 | if (NumElts == 8 || NumElts == 16) |
8355 | return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, Ops: Loads); |
8356 | |
8357 | return Loads[0]; |
8358 | } |
8359 | |
8360 | SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const { |
8361 | // With architected SGPRs, waveIDinGroup is in TTMP8[29:25]. |
8362 | if (!Subtarget->hasArchitectedSGPRs()) |
8363 | return {}; |
8364 | SDLoc SL(Op); |
8365 | MVT VT = MVT::i32; |
8366 | SDValue TTMP8 = DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: SL, Reg: AMDGPU::TTMP8, VT); |
8367 | return DAG.getNode(Opcode: AMDGPUISD::BFE_U32, DL: SL, VT, N1: TTMP8, |
8368 | N2: DAG.getConstant(Val: 25, DL: SL, VT), N3: DAG.getConstant(Val: 5, DL: SL, VT)); |
8369 | } |
8370 | |
8371 | SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op, |
8372 | unsigned Dim, |
8373 | const ArgDescriptor &Arg) const { |
8374 | SDLoc SL(Op); |
8375 | MachineFunction &MF = DAG.getMachineFunction(); |
8376 | unsigned MaxID = Subtarget->getMaxWorkitemID(Kernel: MF.getFunction(), Dimension: Dim); |
8377 | if (MaxID == 0) |
8378 | return DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32); |
8379 | |
8380 | SDValue Val = loadInputValue(DAG, RC: &AMDGPU::VGPR_32RegClass, VT: MVT::i32, |
8381 | SL: SDLoc(DAG.getEntryNode()), Arg); |
8382 | |
8383 | // Don't bother inserting AssertZext for packed IDs since we're emitting the |
8384 | // masking operations anyway. |
8385 | // |
8386 | // TODO: We could assert the top bit is 0 for the source copy. |
8387 | if (Arg.isMasked()) |
8388 | return Val; |
8389 | |
8390 | // Preserve the known bits after expansion to a copy. |
8391 | EVT SmallVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: llvm::bit_width(Value: MaxID)); |
8392 | return DAG.getNode(Opcode: ISD::AssertZext, DL: SL, VT: MVT::i32, N1: Val, |
8393 | N2: DAG.getValueType(SmallVT)); |
8394 | } |
8395 | |
8396 | SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, |
8397 | SelectionDAG &DAG) const { |
8398 | MachineFunction &MF = DAG.getMachineFunction(); |
8399 | auto MFI = MF.getInfo<SIMachineFunctionInfo>(); |
8400 | |
8401 | EVT VT = Op.getValueType(); |
8402 | SDLoc DL(Op); |
8403 | unsigned IntrinsicID = Op.getConstantOperandVal(i: 0); |
8404 | |
8405 | // TODO: Should this propagate fast-math-flags? |
8406 | |
8407 | switch (IntrinsicID) { |
8408 | case Intrinsic::amdgcn_implicit_buffer_ptr: { |
8409 | if (getSubtarget()->isAmdHsaOrMesa(F: MF.getFunction())) |
8410 | return emitNonHSAIntrinsicError(DAG, DL, VT); |
8411 | return getPreloadedValue(DAG, MFI: *MFI, VT, |
8412 | PVID: AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); |
8413 | } |
8414 | case Intrinsic::amdgcn_dispatch_ptr: |
8415 | case Intrinsic::amdgcn_queue_ptr: { |
8416 | if (!Subtarget->isAmdHsaOrMesa(F: MF.getFunction())) { |
8417 | DiagnosticInfoUnsupported BadIntrin( |
8418 | MF.getFunction(), "unsupported hsa intrinsic without hsa target" , |
8419 | DL.getDebugLoc()); |
8420 | DAG.getContext()->diagnose(DI: BadIntrin); |
8421 | return DAG.getUNDEF(VT); |
8422 | } |
8423 | |
8424 | auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ? |
8425 | AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR; |
8426 | return getPreloadedValue(DAG, MFI: *MFI, VT, PVID: RegID); |
8427 | } |
8428 | case Intrinsic::amdgcn_implicitarg_ptr: { |
8429 | if (MFI->isEntryFunction()) |
8430 | return getImplicitArgPtr(DAG, SL: DL); |
8431 | return getPreloadedValue(DAG, MFI: *MFI, VT, |
8432 | PVID: AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); |
8433 | } |
8434 | case Intrinsic::amdgcn_kernarg_segment_ptr: { |
8435 | if (!AMDGPU::isKernel(CC: MF.getFunction().getCallingConv())) { |
8436 | // This only makes sense to call in a kernel, so just lower to null. |
8437 | return DAG.getConstant(Val: 0, DL, VT); |
8438 | } |
8439 | |
8440 | return getPreloadedValue(DAG, MFI: *MFI, VT, |
8441 | PVID: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); |
8442 | } |
8443 | case Intrinsic::amdgcn_dispatch_id: { |
8444 | return getPreloadedValue(DAG, MFI: *MFI, VT, PVID: AMDGPUFunctionArgInfo::DISPATCH_ID); |
8445 | } |
8446 | case Intrinsic::amdgcn_rcp: |
8447 | return DAG.getNode(Opcode: AMDGPUISD::RCP, DL, VT, Operand: Op.getOperand(i: 1)); |
8448 | case Intrinsic::amdgcn_rsq: |
8449 | return DAG.getNode(Opcode: AMDGPUISD::RSQ, DL, VT, Operand: Op.getOperand(i: 1)); |
8450 | case Intrinsic::amdgcn_rsq_legacy: |
8451 | if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) |
8452 | return emitRemovedIntrinsicError(DAG, DL, VT); |
8453 | return SDValue(); |
8454 | case Intrinsic::amdgcn_rcp_legacy: |
8455 | if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) |
8456 | return emitRemovedIntrinsicError(DAG, DL, VT); |
8457 | return DAG.getNode(Opcode: AMDGPUISD::RCP_LEGACY, DL, VT, Operand: Op.getOperand(i: 1)); |
8458 | case Intrinsic::amdgcn_rsq_clamp: { |
8459 | if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) |
8460 | return DAG.getNode(Opcode: AMDGPUISD::RSQ_CLAMP, DL, VT, Operand: Op.getOperand(i: 1)); |
8461 | |
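     | // rsq_clamp is not emitted on VI and later; emulate it by clamping the |
     | // plain rsq result to +/- the largest finite value of the type. |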
8462 | Type *Type = VT.getTypeForEVT(Context&: *DAG.getContext()); |
8463 | APFloat Max = APFloat::getLargest(Sem: Type->getFltSemantics()); |
8464 | APFloat Min = APFloat::getLargest(Sem: Type->getFltSemantics(), Negative: true); |
8465 | |
8466 | SDValue Rsq = DAG.getNode(Opcode: AMDGPUISD::RSQ, DL, VT, Operand: Op.getOperand(i: 1)); |
8467 | SDValue Tmp = DAG.getNode(Opcode: ISD::FMINNUM, DL, VT, N1: Rsq, |
8468 | N2: DAG.getConstantFP(Val: Max, DL, VT)); |
8469 | return DAG.getNode(Opcode: ISD::FMAXNUM, DL, VT, N1: Tmp, |
8470 | N2: DAG.getConstantFP(Val: Min, DL, VT)); |
8471 | } |
8472 | case Intrinsic::r600_read_ngroups_x: |
8473 | if (Subtarget->isAmdHsaOS()) |
8474 | return emitNonHSAIntrinsicError(DAG, DL, VT); |
8475 | |
8476 | return lowerKernargMemParameter(DAG, VT, MemVT: VT, SL: DL, Chain: DAG.getEntryNode(), |
8477 | Offset: SI::KernelInputOffsets::NGROUPS_X, Alignment: Align(4), |
8478 | Signed: false); |
8479 | case Intrinsic::r600_read_ngroups_y: |
8480 | if (Subtarget->isAmdHsaOS()) |
8481 | return emitNonHSAIntrinsicError(DAG, DL, VT); |
8482 | |
8483 | return lowerKernargMemParameter(DAG, VT, MemVT: VT, SL: DL, Chain: DAG.getEntryNode(), |
8484 | Offset: SI::KernelInputOffsets::NGROUPS_Y, Alignment: Align(4), |
8485 | Signed: false); |
8486 | case Intrinsic::r600_read_ngroups_z: |
8487 | if (Subtarget->isAmdHsaOS()) |
8488 | return emitNonHSAIntrinsicError(DAG, DL, VT); |
8489 | |
8490 | return lowerKernargMemParameter(DAG, VT, MemVT: VT, SL: DL, Chain: DAG.getEntryNode(), |
8491 | Offset: SI::KernelInputOffsets::NGROUPS_Z, Alignment: Align(4), |
8492 | Signed: false); |
8493 | case Intrinsic::r600_read_global_size_x: |
8494 | if (Subtarget->isAmdHsaOS()) |
8495 | return emitNonHSAIntrinsicError(DAG, DL, VT); |
8496 | |
8497 | return lowerKernargMemParameter(DAG, VT, MemVT: VT, SL: DL, Chain: DAG.getEntryNode(), |
8498 | Offset: SI::KernelInputOffsets::GLOBAL_SIZE_X, |
8499 | Alignment: Align(4), Signed: false); |
8500 | case Intrinsic::r600_read_global_size_y: |
8501 | if (Subtarget->isAmdHsaOS()) |
8502 | return emitNonHSAIntrinsicError(DAG, DL, VT); |
8503 | |
8504 | return lowerKernargMemParameter(DAG, VT, MemVT: VT, SL: DL, Chain: DAG.getEntryNode(), |
8505 | Offset: SI::KernelInputOffsets::GLOBAL_SIZE_Y, |
8506 | Alignment: Align(4), Signed: false); |
8507 | case Intrinsic::r600_read_global_size_z: |
8508 | if (Subtarget->isAmdHsaOS()) |
8509 | return emitNonHSAIntrinsicError(DAG, DL, VT); |
8510 | |
8511 | return lowerKernargMemParameter(DAG, VT, MemVT: VT, SL: DL, Chain: DAG.getEntryNode(), |
8512 | Offset: SI::KernelInputOffsets::GLOBAL_SIZE_Z, |
8513 | Alignment: Align(4), Signed: false); |
8514 | case Intrinsic::r600_read_local_size_x: |
8515 | if (Subtarget->isAmdHsaOS()) |
8516 | return emitNonHSAIntrinsicError(DAG, DL, VT); |
8517 | |
8518 | return lowerImplicitZextParam(DAG, Op, VT: MVT::i16, |
8519 | Offset: SI::KernelInputOffsets::LOCAL_SIZE_X); |
8520 | case Intrinsic::r600_read_local_size_y: |
8521 | if (Subtarget->isAmdHsaOS()) |
8522 | return emitNonHSAIntrinsicError(DAG, DL, VT); |
8523 | |
8524 | return lowerImplicitZextParam(DAG, Op, VT: MVT::i16, |
8525 | Offset: SI::KernelInputOffsets::LOCAL_SIZE_Y); |
8526 | case Intrinsic::r600_read_local_size_z: |
8527 | if (Subtarget->isAmdHsaOS()) |
8528 | return emitNonHSAIntrinsicError(DAG, DL, VT); |
8529 | |
8530 | return lowerImplicitZextParam(DAG, Op, VT: MVT::i16, |
8531 | Offset: SI::KernelInputOffsets::LOCAL_SIZE_Z); |
8532 | case Intrinsic::amdgcn_workgroup_id_x: |
8533 | return getPreloadedValue(DAG, MFI: *MFI, VT, |
8534 | PVID: AMDGPUFunctionArgInfo::WORKGROUP_ID_X); |
8535 | case Intrinsic::amdgcn_workgroup_id_y: |
8536 | return getPreloadedValue(DAG, MFI: *MFI, VT, |
8537 | PVID: AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); |
8538 | case Intrinsic::amdgcn_workgroup_id_z: |
8539 | return getPreloadedValue(DAG, MFI: *MFI, VT, |
8540 | PVID: AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); |
8541 | case Intrinsic::amdgcn_wave_id: |
8542 | return lowerWaveID(DAG, Op); |
8543 | case Intrinsic::amdgcn_lds_kernel_id: { |
8544 | if (MFI->isEntryFunction()) |
8545 | return getLDSKernelId(DAG, SL: DL); |
8546 | return getPreloadedValue(DAG, MFI: *MFI, VT, |
8547 | PVID: AMDGPUFunctionArgInfo::LDS_KERNEL_ID); |
8548 | } |
8549 | case Intrinsic::amdgcn_workitem_id_x: |
8550 | return lowerWorkitemID(DAG, Op, Dim: 0, Arg: MFI->getArgInfo().WorkItemIDX); |
8551 | case Intrinsic::amdgcn_workitem_id_y: |
8552 | return lowerWorkitemID(DAG, Op, Dim: 1, Arg: MFI->getArgInfo().WorkItemIDY); |
8553 | case Intrinsic::amdgcn_workitem_id_z: |
8554 | return lowerWorkitemID(DAG, Op, Dim: 2, Arg: MFI->getArgInfo().WorkItemIDZ); |
8555 | case Intrinsic::amdgcn_wavefrontsize: |
8556 | return DAG.getConstant(Val: MF.getSubtarget<GCNSubtarget>().getWavefrontSize(), |
8557 | DL: SDLoc(Op), VT: MVT::i32); |
8558 | case Intrinsic::amdgcn_s_buffer_load: { |
8559 | unsigned CPol = Op.getConstantOperandVal(i: 3); |
8560 | // s_buffer_load, because of how it's optimized, can't be volatile |
8561 | // so reject ones with the volatile bit set. |
8562 | if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12) |
8563 | ? AMDGPU::CPol::ALL |
8564 | : AMDGPU::CPol::ALL_pregfx12)) |
8565 | return Op; |
8566 | return lowerSBuffer(VT, DL, Rsrc: Op.getOperand(i: 1), Offset: Op.getOperand(i: 2), CachePolicy: Op.getOperand(i: 3), |
8567 | DAG); |
8568 | } |
8569 | case Intrinsic::amdgcn_fdiv_fast: |
8570 | return lowerFDIV_FAST(Op, DAG); |
8571 | case Intrinsic::amdgcn_sin: |
8572 | return DAG.getNode(Opcode: AMDGPUISD::SIN_HW, DL, VT, Operand: Op.getOperand(i: 1)); |
8573 | |
8574 | case Intrinsic::amdgcn_cos: |
8575 | return DAG.getNode(Opcode: AMDGPUISD::COS_HW, DL, VT, Operand: Op.getOperand(i: 1)); |
8576 | |
8577 | case Intrinsic::amdgcn_mul_u24: |
8578 | return DAG.getNode(Opcode: AMDGPUISD::MUL_U24, DL, VT, N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2)); |
8579 | case Intrinsic::amdgcn_mul_i24: |
8580 | return DAG.getNode(Opcode: AMDGPUISD::MUL_I24, DL, VT, N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2)); |
8581 | |
8582 | case Intrinsic::amdgcn_log_clamp: { |
8583 | if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) |
8584 | return SDValue(); |
8585 | |
8586 | return emitRemovedIntrinsicError(DAG, DL, VT); |
8587 | } |
8588 | case Intrinsic::amdgcn_fract: |
8589 | return DAG.getNode(Opcode: AMDGPUISD::FRACT, DL, VT, Operand: Op.getOperand(i: 1)); |
8590 | |
8591 | case Intrinsic::amdgcn_class: |
8592 | return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL, VT, |
8593 | N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2)); |
8594 | case Intrinsic::amdgcn_div_fmas: |
8595 | return DAG.getNode(Opcode: AMDGPUISD::DIV_FMAS, DL, VT, |
8596 | N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3), |
8597 | N4: Op.getOperand(i: 4)); |
8598 | |
8599 | case Intrinsic::amdgcn_div_fixup: |
8600 | return DAG.getNode(Opcode: AMDGPUISD::DIV_FIXUP, DL, VT, |
8601 | N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3)); |
8602 | |
8603 | case Intrinsic::amdgcn_div_scale: { |
8604 | const ConstantSDNode *Param = cast<ConstantSDNode>(Val: Op.getOperand(i: 3)); |
8605 | |
8606 | // Translate to the operands expected by the machine instruction. The |
8607 | // first parameter must be the same as the first instruction. |
8608 | SDValue Numerator = Op.getOperand(i: 1); |
8609 | SDValue Denominator = Op.getOperand(i: 2); |
8610 | |
8611 | // Note this order is opposite of the machine instruction's operations, |
8612 | // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The |
8613 | // intrinsic has the numerator as the first operand to match a normal |
8614 | // division operation. |
8615 | |
8616 | SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator; |
8617 | |
8618 | return DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL, VTList: Op->getVTList(), N1: Src0, |
8619 | N2: Denominator, N3: Numerator); |
8620 | } |
8621 | case Intrinsic::amdgcn_icmp: { |
8622 | // There is a Pat that handles this variant, so return it as-is. |
8623 | if (Op.getOperand(i: 1).getValueType() == MVT::i1 && |
8624 | Op.getConstantOperandVal(i: 2) == 0 && |
8625 | Op.getConstantOperandVal(i: 3) == ICmpInst::Predicate::ICMP_NE) |
8626 | return Op; |
8627 | return lowerICMPIntrinsic(TLI: *this, N: Op.getNode(), DAG); |
8628 | } |
8629 | case Intrinsic::amdgcn_fcmp: { |
8630 | return lowerFCMPIntrinsic(TLI: *this, N: Op.getNode(), DAG); |
8631 | } |
8632 | case Intrinsic::amdgcn_ballot: |
8633 | return lowerBALLOTIntrinsic(TLI: *this, N: Op.getNode(), DAG); |
8634 | case Intrinsic::amdgcn_fmed3: |
8635 | return DAG.getNode(Opcode: AMDGPUISD::FMED3, DL, VT, |
8636 | N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3)); |
8637 | case Intrinsic::amdgcn_fdot2: |
8638 | return DAG.getNode(Opcode: AMDGPUISD::FDOT2, DL, VT, |
8639 | N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3), |
8640 | N4: Op.getOperand(i: 4)); |
8641 | case Intrinsic::amdgcn_fmul_legacy: |
8642 | return DAG.getNode(Opcode: AMDGPUISD::FMUL_LEGACY, DL, VT, |
8643 | N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2)); |
8644 | case Intrinsic::amdgcn_sffbh: |
8645 | return DAG.getNode(Opcode: AMDGPUISD::FFBH_I32, DL, VT, Operand: Op.getOperand(i: 1)); |
8646 | case Intrinsic::amdgcn_sbfe: |
8647 | return DAG.getNode(Opcode: AMDGPUISD::BFE_I32, DL, VT, |
8648 | N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3)); |
8649 | case Intrinsic::amdgcn_ubfe: |
8650 | return DAG.getNode(Opcode: AMDGPUISD::BFE_U32, DL, VT, |
8651 | N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3)); |
8652 | case Intrinsic::amdgcn_cvt_pkrtz: |
8653 | case Intrinsic::amdgcn_cvt_pknorm_i16: |
8654 | case Intrinsic::amdgcn_cvt_pknorm_u16: |
8655 | case Intrinsic::amdgcn_cvt_pk_i16: |
8656 | case Intrinsic::amdgcn_cvt_pk_u16: { |
8657 | // FIXME: Stop adding cast if v2f16/v2i16 are legal. |
8658 | EVT VT = Op.getValueType(); |
8659 | unsigned Opcode; |
8660 | |
8661 | if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz) |
8662 | Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32; |
8663 | else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16) |
8664 | Opcode = AMDGPUISD::CVT_PKNORM_I16_F32; |
8665 | else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16) |
8666 | Opcode = AMDGPUISD::CVT_PKNORM_U16_F32; |
8667 | else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16) |
8668 | Opcode = AMDGPUISD::CVT_PK_I16_I32; |
8669 | else |
8670 | Opcode = AMDGPUISD::CVT_PK_U16_U32; |
8671 | |
8672 | if (isTypeLegal(VT)) |
8673 | return DAG.getNode(Opcode, DL, VT, N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2)); |
8674 | |
8675 | SDValue Node = DAG.getNode(Opcode, DL, VT: MVT::i32, |
8676 | N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2)); |
8677 | return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Node); |
8678 | } |
8679 | case Intrinsic::amdgcn_fmad_ftz: |
8680 | return DAG.getNode(Opcode: AMDGPUISD::FMAD_FTZ, DL, VT, N1: Op.getOperand(i: 1), |
8681 | N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3)); |
8682 | |
8683 | case Intrinsic::amdgcn_if_break: |
8684 | return SDValue(DAG.getMachineNode(Opcode: AMDGPU::SI_IF_BREAK, dl: DL, VT, |
8685 | Op1: Op->getOperand(Num: 1), Op2: Op->getOperand(Num: 2)), 0); |
8686 | |
8687 | case Intrinsic::amdgcn_groupstaticsize: { |
8688 | Triple::OSType OS = getTargetMachine().getTargetTriple().getOS(); |
8689 | if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) |
8690 | return Op; |
8691 | |
8692 | const Module *M = MF.getFunction().getParent(); |
8693 | const GlobalValue *GV = |
8694 | M->getNamedValue(Name: Intrinsic::getName(id: Intrinsic::amdgcn_groupstaticsize)); |
8695 | SDValue GA = DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i32, offset: 0, |
8696 | TargetFlags: SIInstrInfo::MO_ABS32_LO); |
8697 | return {DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32, Op1: GA), 0}; |
8698 | } |
8699 | case Intrinsic::amdgcn_is_shared: |
8700 | case Intrinsic::amdgcn_is_private: { |
8701 | SDLoc SL(Op); |
8702 | unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared) ? |
8703 | AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS; |
8704 | SDValue Aperture = getSegmentAperture(AS, DL: SL, DAG); |
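     | // A flat pointer is in the queried segment iff the high 32 bits of its |
     | // address equal the segment's aperture base. |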
8705 | SDValue SrcVec = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v2i32, |
8706 | Operand: Op.getOperand(i: 1)); |
8707 | |
8708 | SDValue SrcHi = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: SrcVec, |
8709 | N2: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32)); |
8710 | return DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: SrcHi, RHS: Aperture, Cond: ISD::SETEQ); |
8711 | } |
8712 | case Intrinsic::amdgcn_perm: |
8713 | return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: Op.getOperand(i: 1), |
8714 | N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3)); |
8715 | case Intrinsic::amdgcn_reloc_constant: { |
8716 | Module *M = const_cast<Module *>(MF.getFunction().getParent()); |
8717 | const MDNode *Metadata = cast<MDNodeSDNode>(Val: Op.getOperand(i: 1))->getMD(); |
8718 | auto SymbolName = cast<MDString>(Val: Metadata->getOperand(I: 0))->getString(); |
8719 | auto RelocSymbol = cast<GlobalVariable>( |
8720 | Val: M->getOrInsertGlobal(Name: SymbolName, Ty: Type::getInt32Ty(C&: M->getContext()))); |
8721 | SDValue GA = DAG.getTargetGlobalAddress(GV: RelocSymbol, DL, VT: MVT::i32, offset: 0, |
8722 | TargetFlags: SIInstrInfo::MO_ABS32_LO); |
8723 | return {DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32, Op1: GA), 0}; |
8724 | } |
8725 | case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16: |
8726 | case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16: |
8727 | case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16: |
8728 | case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16: |
8729 | case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8: |
8730 | case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8: |
8731 | case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8: |
8732 | case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: { |
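// The index key operand must be an i32. If it already is, leave the node
// alone; otherwise any-extend or truncate it to i32 and rebuild the
// intrinsic with the converted key.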
8733 | if (Op.getOperand(i: 4).getValueType() == MVT::i32) |
8734 | return SDValue(); |
8735 | |
8736 | SDLoc SL(Op); |
8737 | auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: 4), DL: SL, VT: MVT::i32); |
8738 | return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: Op.getValueType(), |
8739 | N1: Op.getOperand(i: 0), N2: Op.getOperand(i: 1), N3: Op.getOperand(i: 2), |
8740 | N4: Op.getOperand(i: 3), N5: IndexKeyi32); |
8741 | } |
8742 | case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4: |
8743 | case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8: |
8744 | case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: { |
8745 | if (Op.getOperand(i: 6).getValueType() == MVT::i32) |
8746 | return SDValue(); |
8747 | |
8748 | SDLoc SL(Op); |
8749 | auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: 6), DL: SL, VT: MVT::i32); |
8750 | return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: Op.getValueType(), |
8751 | Ops: {Op.getOperand(i: 0), Op.getOperand(i: 1), Op.getOperand(i: 2), |
8752 | Op.getOperand(i: 3), Op.getOperand(i: 4), Op.getOperand(i: 5), |
8753 | IndexKeyi32, Op.getOperand(i: 7)}); |
8754 | } |
8755 | case Intrinsic::amdgcn_addrspacecast_nonnull: |
8756 | return lowerADDRSPACECAST(Op, DAG); |
8757 | case Intrinsic::amdgcn_readlane: |
8758 | case Intrinsic::amdgcn_readfirstlane: |
8759 | case Intrinsic::amdgcn_writelane: |
8760 | case Intrinsic::amdgcn_permlane16: |
8761 | case Intrinsic::amdgcn_permlanex16: |
8762 | case Intrinsic::amdgcn_permlane64: |
8763 | return lowerLaneOp(TLI: *this, N: Op.getNode(), DAG); |
8764 | default: |
8765 | if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = |
8766 | AMDGPU::getImageDimIntrinsicInfo(Intr: IntrinsicID)) |
8767 | return lowerImage(Op, Intr: ImageDimIntr, DAG, WithChain: false); |
8768 | |
8769 | return Op; |
8770 | } |
8771 | } |
8772 | |
// On targets that do not support a constant in the soffset field, turn a
// zero soffset into SGPR_NULL to avoid generating an extra s_mov with zero.
8775 | static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, |
8776 | const GCNSubtarget *Subtarget) { |
8777 | if (Subtarget->hasRestrictedSOffset() && isNullConstant(V: SOffset)) |
8778 | return DAG.getRegister(Reg: AMDGPU::SGPR_NULL, VT: MVT::i32); |
8779 | return SOffset; |
8780 | } |
8781 | |
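// Lower a raw buffer atomic intrinsic to the corresponding AMDGPUISD buffer
// atomic node. Raw variants carry no vindex, so a zero vindex is supplied
// and idxen is cleared.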
8782 | SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op, |
8783 | SelectionDAG &DAG, |
8784 | unsigned NewOpcode) const { |
8785 | SDLoc DL(Op); |
8786 | |
8787 | SDValue VData = Op.getOperand(i: 2); |
8788 | SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG); |
8789 | auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG); |
8790 | auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget); |
8791 | SDValue Ops[] = { |
8792 | Op.getOperand(i: 0), // Chain |
8793 | VData, // vdata |
8794 | Rsrc, // rsrc |
8795 | DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex |
8796 | Offsets.first, // voffset |
8797 | SOffset, // soffset |
8798 | Offsets.second, // offset |
8799 | Op.getOperand(i: 6), // cachepolicy |
8800 | DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen |
8801 | }; |
8802 | |
8803 | auto *M = cast<MemSDNode>(Val&: Op); |
8804 | |
8805 | EVT MemVT = VData.getValueType(); |
8806 | return DAG.getMemIntrinsicNode(Opcode: NewOpcode, dl: DL, VTList: Op->getVTList(), Ops, MemVT, |
8807 | MMO: M->getMemOperand()); |
8808 | } |
8809 | |
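// Lower a struct buffer atomic intrinsic. Unlike the raw form, the struct
// form carries an explicit vindex operand, so idxen is set.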
8810 | SDValue |
8811 | SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG, |
8812 | unsigned NewOpcode) const { |
8813 | SDLoc DL(Op); |
8814 | |
8815 | SDValue VData = Op.getOperand(i: 2); |
8816 | SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG); |
8817 | auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 5), DAG); |
8818 | auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 6), DAG, Subtarget); |
8819 | SDValue Ops[] = { |
8820 | Op.getOperand(i: 0), // Chain |
8821 | VData, // vdata |
8822 | Rsrc, // rsrc |
8823 | Op.getOperand(i: 4), // vindex |
8824 | Offsets.first, // voffset |
8825 | SOffset, // soffset |
8826 | Offsets.second, // offset |
8827 | Op.getOperand(i: 7), // cachepolicy |
8828 | DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // idxen |
8829 | }; |
8830 | |
8831 | auto *M = cast<MemSDNode>(Val&: Op); |
8832 | |
8833 | EVT MemVT = VData.getValueType(); |
8834 | return DAG.getMemIntrinsicNode(Opcode: NewOpcode, dl: DL, VTList: Op->getVTList(), Ops, MemVT, |
8835 | MMO: M->getMemOperand()); |
8836 | } |
8837 | |
8838 | SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, |
8839 | SelectionDAG &DAG) const { |
8840 | unsigned IntrID = Op.getConstantOperandVal(i: 1); |
8841 | SDLoc DL(Op); |
8842 | |
8843 | switch (IntrID) { |
8844 | case Intrinsic::amdgcn_ds_ordered_add: |
8845 | case Intrinsic::amdgcn_ds_ordered_swap: { |
8846 | MemSDNode *M = cast<MemSDNode>(Val&: Op); |
8847 | SDValue Chain = M->getOperand(Num: 0); |
8848 | SDValue M0 = M->getOperand(Num: 2); |
8849 | SDValue Value = M->getOperand(Num: 3); |
8850 | unsigned IndexOperand = M->getConstantOperandVal(Num: 7); |
8851 | unsigned WaveRelease = M->getConstantOperandVal(Num: 8); |
8852 | unsigned WaveDone = M->getConstantOperandVal(Num: 9); |
8853 | |
8854 | unsigned OrderedCountIndex = IndexOperand & 0x3f; |
8855 | IndexOperand &= ~0x3f; |
8856 | unsigned CountDw = 0; |
8857 | |
8858 | if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) { |
8859 | CountDw = (IndexOperand >> 24) & 0xf; |
8860 | IndexOperand &= ~(0xf << 24); |
8861 | |
8862 | if (CountDw < 1 || CountDw > 4) { |
8863 | report_fatal_error( |
8864 | reason: "ds_ordered_count: dword count must be between 1 and 4" ); |
8865 | } |
8866 | } |
8867 | |
8868 | if (IndexOperand) |
8869 | report_fatal_error(reason: "ds_ordered_count: bad index operand" ); |
8870 | |
8871 | if (WaveDone && !WaveRelease) |
8872 | report_fatal_error(reason: "ds_ordered_count: wave_done requires wave_release" ); |
8873 | |
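// Pack the ds_ordered_count immediate: the low byte holds the ordered count
// index scaled to a byte offset, and the high byte packs wave_release,
// wave_done, the shader type (pre-GFX11), the add/swap selector and, on
// GFX10+, the dword count minus one.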
8874 | unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1; |
8875 | unsigned ShaderType = |
8876 | SIInstrInfo::getDSShaderTypeValue(MF: DAG.getMachineFunction()); |
8877 | unsigned Offset0 = OrderedCountIndex << 2; |
8878 | unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4); |
8879 | |
8880 | if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) |
8881 | Offset1 |= (CountDw - 1) << 6; |
8882 | |
8883 | if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11) |
8884 | Offset1 |= ShaderType << 2; |
8885 | |
8886 | unsigned Offset = Offset0 | (Offset1 << 8); |
8887 | |
8888 | SDValue Ops[] = { |
8889 | Chain, |
8890 | Value, |
8891 | DAG.getTargetConstant(Val: Offset, DL, VT: MVT::i16), |
8892 | copyToM0(DAG, Chain, DL, V: M0).getValue(R: 1), // Glue |
8893 | }; |
8894 | return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::DS_ORDERED_COUNT, dl: DL, |
8895 | VTList: M->getVTList(), Ops, MemVT: M->getMemoryVT(), |
8896 | MMO: M->getMemOperand()); |
8897 | } |
8898 | case Intrinsic::amdgcn_raw_buffer_load: |
8899 | case Intrinsic::amdgcn_raw_ptr_buffer_load: |
8900 | case Intrinsic::amdgcn_raw_atomic_buffer_load: |
8901 | case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load: |
8902 | case Intrinsic::amdgcn_raw_buffer_load_format: |
8903 | case Intrinsic::amdgcn_raw_ptr_buffer_load_format: { |
8904 | const bool IsFormat = |
8905 | IntrID == Intrinsic::amdgcn_raw_buffer_load_format || |
8906 | IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format; |
8907 | |
8908 | SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG); |
8909 | auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 3), DAG); |
8910 | auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 4), DAG, Subtarget); |
8911 | SDValue Ops[] = { |
8912 | Op.getOperand(i: 0), // Chain |
8913 | Rsrc, // rsrc |
8914 | DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex |
8915 | Offsets.first, // voffset |
8916 | SOffset, // soffset |
8917 | Offsets.second, // offset |
8918 | Op.getOperand(i: 5), // cachepolicy, swizzled buffer |
8919 | DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen |
8920 | }; |
8921 | |
8922 | auto *M = cast<MemSDNode>(Val&: Op); |
8923 | return lowerIntrinsicLoad(M, IsFormat, DAG, Ops); |
8924 | } |
8925 | case Intrinsic::amdgcn_struct_buffer_load: |
8926 | case Intrinsic::amdgcn_struct_ptr_buffer_load: |
8927 | case Intrinsic::amdgcn_struct_buffer_load_format: |
8928 | case Intrinsic::amdgcn_struct_ptr_buffer_load_format: { |
8929 | const bool IsFormat = |
8930 | IntrID == Intrinsic::amdgcn_struct_buffer_load_format || |
8931 | IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format; |
8932 | |
8933 | SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG); |
8934 | auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG); |
8935 | auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget); |
8936 | SDValue Ops[] = { |
8937 | Op.getOperand(i: 0), // Chain |
8938 | Rsrc, // rsrc |
8939 | Op.getOperand(i: 3), // vindex |
8940 | Offsets.first, // voffset |
8941 | SOffset, // soffset |
8942 | Offsets.second, // offset |
8943 | Op.getOperand(i: 6), // cachepolicy, swizzled buffer |
8944 | DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // idxen |
8945 | }; |
8946 | |
8947 | return lowerIntrinsicLoad(M: cast<MemSDNode>(Val&: Op), IsFormat, DAG, Ops); |
8948 | } |
8949 | case Intrinsic::amdgcn_raw_tbuffer_load: |
8950 | case Intrinsic::amdgcn_raw_ptr_tbuffer_load: { |
8951 | MemSDNode *M = cast<MemSDNode>(Val&: Op); |
8952 | EVT LoadVT = Op.getValueType(); |
8953 | SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG); |
8954 | auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 3), DAG); |
8955 | auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 4), DAG, Subtarget); |
8956 | |
8957 | SDValue Ops[] = { |
8958 | Op.getOperand(i: 0), // Chain |
8959 | Rsrc, // rsrc |
8960 | DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex |
8961 | Offsets.first, // voffset |
8962 | SOffset, // soffset |
8963 | Offsets.second, // offset |
8964 | Op.getOperand(i: 5), // format |
8965 | Op.getOperand(i: 6), // cachepolicy, swizzled buffer |
8966 | DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen |
8967 | }; |
8968 | |
8969 | if (LoadVT.getScalarType() == MVT::f16) |
8970 | return adjustLoadValueType(Opcode: AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, |
8971 | M, DAG, Ops); |
8972 | return getMemIntrinsicNode(Opcode: AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, |
8973 | VTList: Op->getVTList(), Ops, MemVT: LoadVT, MMO: M->getMemOperand(), |
8974 | DAG); |
8975 | } |
8976 | case Intrinsic::amdgcn_struct_tbuffer_load: |
8977 | case Intrinsic::amdgcn_struct_ptr_tbuffer_load: { |
8978 | MemSDNode *M = cast<MemSDNode>(Val&: Op); |
8979 | EVT LoadVT = Op.getValueType(); |
8980 | SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG); |
8981 | auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG); |
8982 | auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget); |
8983 | |
8984 | SDValue Ops[] = { |
8985 | Op.getOperand(i: 0), // Chain |
8986 | Rsrc, // rsrc |
8987 | Op.getOperand(i: 3), // vindex |
8988 | Offsets.first, // voffset |
8989 | SOffset, // soffset |
8990 | Offsets.second, // offset |
8991 | Op.getOperand(i: 6), // format |
8992 | Op.getOperand(i: 7), // cachepolicy, swizzled buffer |
8993 | DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // idxen |
8994 | }; |
8995 | |
8996 | if (LoadVT.getScalarType() == MVT::f16) |
8997 | return adjustLoadValueType(Opcode: AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, |
8998 | M, DAG, Ops); |
8999 | return getMemIntrinsicNode(Opcode: AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, |
9000 | VTList: Op->getVTList(), Ops, MemVT: LoadVT, MMO: M->getMemOperand(), |
9001 | DAG); |
9002 | } |
9003 | case Intrinsic::amdgcn_raw_buffer_atomic_fadd: |
9004 | case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd: |
9005 | return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FADD); |
9006 | case Intrinsic::amdgcn_struct_buffer_atomic_fadd: |
9007 | case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd: |
9008 | return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FADD); |
9009 | case Intrinsic::amdgcn_raw_buffer_atomic_fmin: |
9010 | case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin: |
9011 | return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FMIN); |
9012 | case Intrinsic::amdgcn_struct_buffer_atomic_fmin: |
9013 | case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin: |
9014 | return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FMIN); |
9015 | case Intrinsic::amdgcn_raw_buffer_atomic_fmax: |
9016 | case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax: |
9017 | return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FMAX); |
9018 | case Intrinsic::amdgcn_struct_buffer_atomic_fmax: |
9019 | case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax: |
9020 | return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FMAX); |
9021 | case Intrinsic::amdgcn_raw_buffer_atomic_swap: |
9022 | case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap: |
9023 | return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SWAP); |
9024 | case Intrinsic::amdgcn_raw_buffer_atomic_add: |
9025 | case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add: |
9026 | return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_ADD); |
9027 | case Intrinsic::amdgcn_raw_buffer_atomic_sub: |
9028 | case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub: |
9029 | return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SUB); |
9030 | case Intrinsic::amdgcn_raw_buffer_atomic_smin: |
9031 | case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin: |
9032 | return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SMIN); |
9033 | case Intrinsic::amdgcn_raw_buffer_atomic_umin: |
9034 | case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin: |
9035 | return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_UMIN); |
9036 | case Intrinsic::amdgcn_raw_buffer_atomic_smax: |
9037 | case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax: |
9038 | return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SMAX); |
9039 | case Intrinsic::amdgcn_raw_buffer_atomic_umax: |
9040 | case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax: |
9041 | return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_UMAX); |
9042 | case Intrinsic::amdgcn_raw_buffer_atomic_and: |
9043 | case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and: |
9044 | return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_AND); |
9045 | case Intrinsic::amdgcn_raw_buffer_atomic_or: |
9046 | case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or: |
9047 | return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_OR); |
9048 | case Intrinsic::amdgcn_raw_buffer_atomic_xor: |
9049 | case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor: |
9050 | return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_XOR); |
9051 | case Intrinsic::amdgcn_raw_buffer_atomic_inc: |
9052 | case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc: |
9053 | return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_INC); |
9054 | case Intrinsic::amdgcn_raw_buffer_atomic_dec: |
9055 | case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec: |
9056 | return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_DEC); |
9057 | case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32: |
9058 | return lowerRawBufferAtomicIntrin(Op, DAG, |
9059 | NewOpcode: AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32); |
9060 | case Intrinsic::amdgcn_struct_buffer_atomic_swap: |
9061 | case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap: |
9062 | return lowerStructBufferAtomicIntrin(Op, DAG, |
9063 | NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SWAP); |
9064 | case Intrinsic::amdgcn_struct_buffer_atomic_add: |
9065 | case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add: |
9066 | return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_ADD); |
9067 | case Intrinsic::amdgcn_struct_buffer_atomic_sub: |
9068 | case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub: |
9069 | return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SUB); |
9070 | case Intrinsic::amdgcn_struct_buffer_atomic_smin: |
9071 | case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin: |
9072 | return lowerStructBufferAtomicIntrin(Op, DAG, |
9073 | NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SMIN); |
9074 | case Intrinsic::amdgcn_struct_buffer_atomic_umin: |
9075 | case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin: |
9076 | return lowerStructBufferAtomicIntrin(Op, DAG, |
9077 | NewOpcode: AMDGPUISD::BUFFER_ATOMIC_UMIN); |
9078 | case Intrinsic::amdgcn_struct_buffer_atomic_smax: |
9079 | case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax: |
9080 | return lowerStructBufferAtomicIntrin(Op, DAG, |
9081 | NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SMAX); |
9082 | case Intrinsic::amdgcn_struct_buffer_atomic_umax: |
9083 | case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax: |
9084 | return lowerStructBufferAtomicIntrin(Op, DAG, |
9085 | NewOpcode: AMDGPUISD::BUFFER_ATOMIC_UMAX); |
9086 | case Intrinsic::amdgcn_struct_buffer_atomic_and: |
9087 | case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and: |
9088 | return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_AND); |
9089 | case Intrinsic::amdgcn_struct_buffer_atomic_or: |
9090 | case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or: |
9091 | return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_OR); |
9092 | case Intrinsic::amdgcn_struct_buffer_atomic_xor: |
9093 | case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor: |
9094 | return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_XOR); |
9095 | case Intrinsic::amdgcn_struct_buffer_atomic_inc: |
9096 | case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc: |
9097 | return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_INC); |
9098 | case Intrinsic::amdgcn_struct_buffer_atomic_dec: |
9099 | case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec: |
9100 | return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_DEC); |
9101 | case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32: |
9102 | return lowerStructBufferAtomicIntrin(Op, DAG, |
9103 | NewOpcode: AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32); |
9104 | |
9105 | case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: |
9106 | case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: { |
9107 | SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 4), DAG); |
9108 | auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 5), DAG); |
9109 | auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 6), DAG, Subtarget); |
9110 | SDValue Ops[] = { |
9111 | Op.getOperand(i: 0), // Chain |
9112 | Op.getOperand(i: 2), // src |
9113 | Op.getOperand(i: 3), // cmp |
9114 | Rsrc, // rsrc |
9115 | DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex |
9116 | Offsets.first, // voffset |
9117 | SOffset, // soffset |
9118 | Offsets.second, // offset |
9119 | Op.getOperand(i: 7), // cachepolicy |
9120 | DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen |
9121 | }; |
9122 | EVT VT = Op.getValueType(); |
9123 | auto *M = cast<MemSDNode>(Val&: Op); |
9124 | |
9125 | return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, dl: DL, |
9126 | VTList: Op->getVTList(), Ops, MemVT: VT, MMO: M->getMemOperand()); |
9127 | } |
9128 | case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: |
9129 | case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: { |
9130 | SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op->getOperand(Num: 4), DAG); |
9131 | auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 6), DAG); |
9132 | auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 7), DAG, Subtarget); |
9133 | SDValue Ops[] = { |
9134 | Op.getOperand(i: 0), // Chain |
9135 | Op.getOperand(i: 2), // src |
9136 | Op.getOperand(i: 3), // cmp |
9137 | Rsrc, // rsrc |
9138 | Op.getOperand(i: 5), // vindex |
9139 | Offsets.first, // voffset |
9140 | SOffset, // soffset |
9141 | Offsets.second, // offset |
9142 | Op.getOperand(i: 8), // cachepolicy |
9143 | DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // idxen |
9144 | }; |
9145 | EVT VT = Op.getValueType(); |
9146 | auto *M = cast<MemSDNode>(Val&: Op); |
9147 | |
9148 | return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, dl: DL, |
9149 | VTList: Op->getVTList(), Ops, MemVT: VT, MMO: M->getMemOperand()); |
9150 | } |
9151 | case Intrinsic::amdgcn_image_bvh_intersect_ray: { |
9152 | MemSDNode *M = cast<MemSDNode>(Val&: Op); |
9153 | SDValue NodePtr = M->getOperand(Num: 2); |
9154 | SDValue RayExtent = M->getOperand(Num: 3); |
9155 | SDValue RayOrigin = M->getOperand(Num: 4); |
9156 | SDValue RayDir = M->getOperand(Num: 5); |
9157 | SDValue RayInvDir = M->getOperand(Num: 6); |
9158 | SDValue TDescr = M->getOperand(Num: 7); |
9159 | |
9160 | assert(NodePtr.getValueType() == MVT::i32 || |
9161 | NodePtr.getValueType() == MVT::i64); |
9162 | assert(RayDir.getValueType() == MVT::v3f16 || |
9163 | RayDir.getValueType() == MVT::v3f32); |
9164 | |
9165 | if (!Subtarget->hasGFX10_AEncoding()) { |
9166 | emitRemovedIntrinsicError(DAG, DL, VT: Op.getValueType()); |
9167 | return SDValue(); |
9168 | } |
9169 | |
9170 | const bool IsGFX11 = AMDGPU::isGFX11(STI: *Subtarget); |
9171 | const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI: *Subtarget); |
9172 | const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI: *Subtarget); |
9173 | const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16; |
9174 | const bool Is64 = NodePtr.getValueType() == MVT::i64; |
9175 | const unsigned NumVDataDwords = 4; |
9176 | const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11); |
9177 | const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords; |
9178 | const bool UseNSA = (Subtarget->hasNSAEncoding() && |
9179 | NumVAddrs <= Subtarget->getNSAMaxSize()) || |
9180 | IsGFX12Plus; |
9181 | const unsigned BaseOpcodes[2][2] = { |
9182 | {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, |
9183 | {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, |
9184 | AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}}; |
9185 | int Opcode; |
9186 | if (UseNSA) { |
9187 | Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: BaseOpcodes[Is64][IsA16], |
9188 | MIMGEncoding: IsGFX12Plus ? AMDGPU::MIMGEncGfx12 |
9189 | : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA |
9190 | : AMDGPU::MIMGEncGfx10NSA, |
9191 | VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords); |
9192 | } else { |
9193 | assert(!IsGFX12Plus); |
9194 | Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: BaseOpcodes[Is64][IsA16], |
9195 | MIMGEncoding: IsGFX11 ? AMDGPU::MIMGEncGfx11Default |
9196 | : AMDGPU::MIMGEncGfx10Default, |
9197 | VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords); |
9198 | } |
9199 | assert(Opcode != -1); |
9200 | |
9201 | SmallVector<SDValue, 16> Ops; |
9202 | |
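// Pack the x/y/z lanes of a ray operand into VADDR dwords: f32 lanes take a
// dword each, while f16 lanes are packed two per dword, pairing across
// operands when the previous operand left an odd half-dword free.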
9203 | auto packLanes = [&DAG, &Ops, &DL] (SDValue Op, bool IsAligned) { |
9204 | SmallVector<SDValue, 3> Lanes; |
9205 | DAG.ExtractVectorElements(Op, Args&: Lanes, Start: 0, Count: 3); |
9206 | if (Lanes[0].getValueSizeInBits() == 32) { |
9207 | for (unsigned I = 0; I < 3; ++I) |
9208 | Ops.push_back(Elt: DAG.getBitcast(VT: MVT::i32, V: Lanes[I])); |
9209 | } else { |
9210 | if (IsAligned) { |
9211 | Ops.push_back( |
9212 | Elt: DAG.getBitcast(VT: MVT::i32, |
9213 | V: DAG.getBuildVector(VT: MVT::v2f16, DL, |
9214 | Ops: { Lanes[0], Lanes[1] }))); |
9215 | Ops.push_back(Elt: Lanes[2]); |
9216 | } else { |
9217 | SDValue Elt0 = Ops.pop_back_val(); |
9218 | Ops.push_back( |
9219 | Elt: DAG.getBitcast(VT: MVT::i32, |
9220 | V: DAG.getBuildVector(VT: MVT::v2f16, DL, |
9221 | Ops: { Elt0, Lanes[0] }))); |
9222 | Ops.push_back( |
9223 | Elt: DAG.getBitcast(VT: MVT::i32, |
9224 | V: DAG.getBuildVector(VT: MVT::v2f16, DL, |
9225 | Ops: { Lanes[1], Lanes[2] }))); |
9226 | } |
9227 | } |
9228 | }; |
9229 | |
9230 | if (UseNSA && IsGFX11Plus) { |
9231 | Ops.push_back(Elt: NodePtr); |
9232 | Ops.push_back(Elt: DAG.getBitcast(VT: MVT::i32, V: RayExtent)); |
9233 | Ops.push_back(Elt: RayOrigin); |
9234 | if (IsA16) { |
9235 | SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes; |
9236 | DAG.ExtractVectorElements(Op: RayDir, Args&: DirLanes, Start: 0, Count: 3); |
9237 | DAG.ExtractVectorElements(Op: RayInvDir, Args&: InvDirLanes, Start: 0, Count: 3); |
9238 | for (unsigned I = 0; I < 3; ++I) { |
9239 | MergedLanes.push_back(Elt: DAG.getBitcast( |
9240 | VT: MVT::i32, V: DAG.getBuildVector(VT: MVT::v2f16, DL, |
9241 | Ops: {DirLanes[I], InvDirLanes[I]}))); |
9242 | } |
9243 | Ops.push_back(Elt: DAG.getBuildVector(VT: MVT::v3i32, DL, Ops: MergedLanes)); |
9244 | } else { |
9245 | Ops.push_back(Elt: RayDir); |
9246 | Ops.push_back(Elt: RayInvDir); |
9247 | } |
9248 | } else { |
9249 | if (Is64) |
9250 | DAG.ExtractVectorElements(Op: DAG.getBitcast(VT: MVT::v2i32, V: NodePtr), Args&: Ops, Start: 0, |
9251 | Count: 2); |
9252 | else |
9253 | Ops.push_back(Elt: NodePtr); |
9254 | |
9255 | Ops.push_back(Elt: DAG.getBitcast(VT: MVT::i32, V: RayExtent)); |
9256 | packLanes(RayOrigin, true); |
9257 | packLanes(RayDir, true); |
9258 | packLanes(RayInvDir, false); |
9259 | } |
9260 | |
9261 | if (!UseNSA) { |
9262 | // Build a single vector containing all the operands so far prepared. |
9263 | if (NumVAddrDwords > 12) { |
9264 | SDValue Undef = DAG.getUNDEF(VT: MVT::i32); |
9265 | Ops.append(NumInputs: 16 - Ops.size(), Elt: Undef); |
9266 | } |
9267 | assert(Ops.size() >= 8 && Ops.size() <= 12); |
9268 | SDValue MergedOps = DAG.getBuildVector( |
9269 | VT: MVT::getVectorVT(VT: MVT::i32, NumElements: Ops.size()), DL, Ops); |
9270 | Ops.clear(); |
9271 | Ops.push_back(Elt: MergedOps); |
9272 | } |
9273 | |
9274 | Ops.push_back(Elt: TDescr); |
9275 | Ops.push_back(Elt: DAG.getTargetConstant(Val: IsA16, DL, VT: MVT::i1)); |
9276 | Ops.push_back(Elt: M->getChain()); |
9277 | |
9278 | auto *NewNode = DAG.getMachineNode(Opcode, dl: DL, VTs: M->getVTList(), Ops); |
9279 | MachineMemOperand *MemRef = M->getMemOperand(); |
9280 | DAG.setNodeMemRefs(N: NewNode, NewMemRefs: {MemRef}); |
9281 | return SDValue(NewNode, 0); |
9282 | } |
9283 | case Intrinsic::amdgcn_global_atomic_fmin: |
9284 | case Intrinsic::amdgcn_global_atomic_fmax: |
9285 | case Intrinsic::amdgcn_global_atomic_fmin_num: |
9286 | case Intrinsic::amdgcn_global_atomic_fmax_num: |
9287 | case Intrinsic::amdgcn_flat_atomic_fmin: |
9288 | case Intrinsic::amdgcn_flat_atomic_fmax: |
9289 | case Intrinsic::amdgcn_flat_atomic_fmin_num: |
9290 | case Intrinsic::amdgcn_flat_atomic_fmax_num: { |
9291 | MemSDNode *M = cast<MemSDNode>(Val&: Op); |
9292 | SDValue Ops[] = { |
9293 | M->getOperand(Num: 0), // Chain |
9294 | M->getOperand(Num: 2), // Ptr |
9295 | M->getOperand(Num: 3) // Value |
9296 | }; |
9297 | unsigned Opcode = 0; |
9298 | switch (IntrID) { |
9299 | case Intrinsic::amdgcn_global_atomic_fmin: |
9300 | case Intrinsic::amdgcn_global_atomic_fmin_num: |
9301 | case Intrinsic::amdgcn_flat_atomic_fmin: |
9302 | case Intrinsic::amdgcn_flat_atomic_fmin_num: { |
9303 | Opcode = ISD::ATOMIC_LOAD_FMIN; |
9304 | break; |
9305 | } |
9306 | case Intrinsic::amdgcn_global_atomic_fmax: |
9307 | case Intrinsic::amdgcn_global_atomic_fmax_num: |
9308 | case Intrinsic::amdgcn_flat_atomic_fmax: |
9309 | case Intrinsic::amdgcn_flat_atomic_fmax_num: { |
9310 | Opcode = ISD::ATOMIC_LOAD_FMAX; |
9311 | break; |
9312 | } |
9313 | default: |
9314 | llvm_unreachable("unhandled atomic opcode" ); |
9315 | } |
9316 | return DAG.getAtomic(Opcode, dl: SDLoc(Op), MemVT: M->getMemoryVT(), VTList: M->getVTList(), |
9317 | Ops, MMO: M->getMemOperand()); |
9318 | } |
9319 | case Intrinsic::amdgcn_s_get_barrier_state: { |
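// Use the immediate form of the instruction when the barrier ID is an
// inlinable constant; otherwise the barrier ID is passed in M0.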
9320 | SDValue Chain = Op->getOperand(Num: 0); |
9321 | SmallVector<SDValue, 2> Ops; |
9322 | unsigned Opc; |
9323 | bool IsInlinableBarID = false; |
9324 | int64_t BarID; |
9325 | |
9326 | if (isa<ConstantSDNode>(Val: Op->getOperand(Num: 2))) { |
9327 | BarID = cast<ConstantSDNode>(Val: Op->getOperand(Num: 2))->getSExtValue(); |
9328 | IsInlinableBarID = AMDGPU::isInlinableIntLiteral(Literal: BarID); |
9329 | } |
9330 | |
9331 | if (IsInlinableBarID) { |
9332 | Opc = AMDGPU::S_GET_BARRIER_STATE_IMM; |
9333 | SDValue K = DAG.getTargetConstant(Val: BarID, DL, VT: MVT::i32); |
9334 | Ops.push_back(Elt: K); |
9335 | } else { |
9336 | Opc = AMDGPU::S_GET_BARRIER_STATE_M0; |
9337 | SDValue M0Val = copyToM0(DAG, Chain, DL, V: Op.getOperand(i: 2)); |
9338 | Ops.push_back(Elt: M0Val.getValue(R: 0)); |
9339 | } |
9340 | |
9341 | auto NewMI = DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: Op->getVTList(), Ops); |
9342 | return SDValue(NewMI, 0); |
9343 | } |
9344 | default: |
9345 | |
9346 | if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = |
9347 | AMDGPU::getImageDimIntrinsicInfo(Intr: IntrID)) |
9348 | return lowerImage(Op, Intr: ImageDimIntr, DAG, WithChain: true); |
9349 | |
9350 | return SDValue(); |
9351 | } |
9352 | } |
9353 | |
// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
// dwordx4 on subtargets without dwordx3 load/stores, and handle TFE loads.
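// A TFE load returns an extra status dword, so the memory operation is done
// on NumValueDWords + 1 i32s and the value and status are split back out of
// the widened result afterwards.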
9356 | SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL, |
9357 | SDVTList VTList, |
9358 | ArrayRef<SDValue> Ops, EVT MemVT, |
9359 | MachineMemOperand *MMO, |
9360 | SelectionDAG &DAG) const { |
9361 | LLVMContext &C = *DAG.getContext(); |
9362 | MachineFunction &MF = DAG.getMachineFunction(); |
9363 | EVT VT = VTList.VTs[0]; |
9364 | |
9365 | assert(VTList.NumVTs == 2 || VTList.NumVTs == 3); |
9366 | bool IsTFE = VTList.NumVTs == 3; |
9367 | if (IsTFE) { |
9368 | unsigned NumValueDWords = divideCeil(Numerator: VT.getSizeInBits(), Denominator: 32); |
9369 | unsigned NumOpDWords = NumValueDWords + 1; |
9370 | EVT OpDWordsVT = EVT::getVectorVT(Context&: C, VT: MVT::i32, NumElements: NumOpDWords); |
9371 | SDVTList OpDWordsVTList = DAG.getVTList(VT1: OpDWordsVT, VT2: VTList.VTs[2]); |
9372 | MachineMemOperand *OpDWordsMMO = |
9373 | MF.getMachineMemOperand(MMO, Offset: 0, Size: NumOpDWords * 4); |
9374 | SDValue Op = getMemIntrinsicNode(Opcode, DL, VTList: OpDWordsVTList, Ops, |
9375 | MemVT: OpDWordsVT, MMO: OpDWordsMMO, DAG); |
9376 | SDValue Status = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: Op, |
9377 | N2: DAG.getVectorIdxConstant(Val: NumValueDWords, DL)); |
9378 | SDValue ZeroIdx = DAG.getVectorIdxConstant(Val: 0, DL); |
9379 | SDValue ValueDWords = |
9380 | NumValueDWords == 1 |
9381 | ? DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: Op, N2: ZeroIdx) |
9382 | : DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, |
9383 | VT: EVT::getVectorVT(Context&: C, VT: MVT::i32, NumElements: NumValueDWords), N1: Op, |
9384 | N2: ZeroIdx); |
9385 | SDValue Value = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: ValueDWords); |
9386 | return DAG.getMergeValues(Ops: {Value, Status, SDValue(Op.getNode(), 1)}, dl: DL); |
9387 | } |
9388 | |
9389 | if (!Subtarget->hasDwordx3LoadStores() && |
9390 | (VT == MVT::v3i32 || VT == MVT::v3f32)) { |
9391 | EVT WidenedVT = EVT::getVectorVT(Context&: C, VT: VT.getVectorElementType(), NumElements: 4); |
9392 | EVT WidenedMemVT = EVT::getVectorVT(Context&: C, VT: MemVT.getVectorElementType(), NumElements: 4); |
9393 | MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, Offset: 0, Size: 16); |
9394 | SDVTList WidenedVTList = DAG.getVTList(VT1: WidenedVT, VT2: VTList.VTs[1]); |
9395 | SDValue Op = DAG.getMemIntrinsicNode(Opcode, dl: DL, VTList: WidenedVTList, Ops, |
9396 | MemVT: WidenedMemVT, MMO: WidenedMMO); |
9397 | SDValue Value = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: Op, |
9398 | N2: DAG.getVectorIdxConstant(Val: 0, DL)); |
9399 | return DAG.getMergeValues(Ops: {Value, SDValue(Op.getNode(), 1)}, dl: DL); |
9400 | } |
9401 | |
9402 | return DAG.getMemIntrinsicNode(Opcode, dl: DL, VTList, Ops, MemVT, MMO); |
9403 | } |
9404 | |
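// Repack 16-bit vector store data into the layout expected by the D16
// buffer and image store instructions on the current subtarget.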
9405 | SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG, |
9406 | bool ImageStore) const { |
9407 | EVT StoreVT = VData.getValueType(); |
9408 | |
9409 | // No change for f16 and legal vector D16 types. |
9410 | if (!StoreVT.isVector()) |
9411 | return VData; |
9412 | |
9413 | SDLoc DL(VData); |
9414 | unsigned NumElements = StoreVT.getVectorNumElements(); |
9415 | |
9416 | if (Subtarget->hasUnpackedD16VMem()) { |
9417 | // We need to unpack the packed data to store. |
9418 | EVT IntStoreVT = StoreVT.changeTypeToInteger(); |
9419 | SDValue IntVData = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IntStoreVT, Operand: VData); |
9420 | |
9421 | EVT EquivStoreVT = |
9422 | EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements); |
9423 | SDValue ZExt = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: EquivStoreVT, Operand: IntVData); |
9424 | return DAG.UnrollVectorOp(N: ZExt.getNode()); |
9425 | } |
9426 | |
9427 | // The sq block of gfx8.1 does not estimate register use correctly for d16 |
9428 | // image store instructions. The data operand is computed as if it were not a |
9429 | // d16 image instruction. |
9430 | if (ImageStore && Subtarget->hasImageStoreD16Bug()) { |
9431 | // Bitcast to i16 |
9432 | EVT IntStoreVT = StoreVT.changeTypeToInteger(); |
9433 | SDValue IntVData = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IntStoreVT, Operand: VData); |
9434 | |
9435 | // Decompose into scalars |
9436 | SmallVector<SDValue, 4> Elts; |
9437 | DAG.ExtractVectorElements(Op: IntVData, Args&: Elts); |
9438 | |
9439 | // Group pairs of i16 into v2i16 and bitcast to i32 |
9440 | SmallVector<SDValue, 4> PackedElts; |
9441 | for (unsigned I = 0; I < Elts.size() / 2; I += 1) { |
9442 | SDValue Pair = |
9443 | DAG.getBuildVector(VT: MVT::v2i16, DL, Ops: {Elts[I * 2], Elts[I * 2 + 1]}); |
9444 | SDValue IntPair = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i32, Operand: Pair); |
9445 | PackedElts.push_back(Elt: IntPair); |
9446 | } |
9447 | if ((NumElements % 2) == 1) { |
9448 | // Handle v3i16 |
9449 | unsigned I = Elts.size() / 2; |
9450 | SDValue Pair = DAG.getBuildVector(VT: MVT::v2i16, DL, |
9451 | Ops: {Elts[I * 2], DAG.getUNDEF(VT: MVT::i16)}); |
9452 | SDValue IntPair = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i32, Operand: Pair); |
9453 | PackedElts.push_back(Elt: IntPair); |
9454 | } |
9455 | |
9456 | // Pad using UNDEF |
9457 | PackedElts.resize(N: Elts.size(), NV: DAG.getUNDEF(VT: MVT::i32)); |
9458 | |
9459 | // Build final vector |
9460 | EVT VecVT = |
9461 | EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements: PackedElts.size()); |
9462 | return DAG.getBuildVector(VT: VecVT, DL, Ops: PackedElts); |
9463 | } |
9464 | |
9465 | if (NumElements == 3) { |
9466 | EVT IntStoreVT = |
9467 | EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: StoreVT.getStoreSizeInBits()); |
9468 | SDValue IntVData = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IntStoreVT, Operand: VData); |
9469 | |
9470 | EVT WidenedStoreVT = EVT::getVectorVT( |
9471 | Context&: *DAG.getContext(), VT: StoreVT.getVectorElementType(), NumElements: NumElements + 1); |
9472 | EVT WidenedIntVT = EVT::getIntegerVT(Context&: *DAG.getContext(), |
9473 | BitWidth: WidenedStoreVT.getStoreSizeInBits()); |
9474 | SDValue ZExt = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: WidenedIntVT, Operand: IntVData); |
9475 | return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: WidenedStoreVT, Operand: ZExt); |
9476 | } |
9477 | |
9478 | assert(isTypeLegal(StoreVT)); |
9479 | return VData; |
9480 | } |
9481 | |
9482 | SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, |
9483 | SelectionDAG &DAG) const { |
9484 | SDLoc DL(Op); |
9485 | SDValue Chain = Op.getOperand(i: 0); |
9486 | unsigned IntrinsicID = Op.getConstantOperandVal(i: 1); |
9487 | MachineFunction &MF = DAG.getMachineFunction(); |
9488 | |
9489 | switch (IntrinsicID) { |
9490 | case Intrinsic::amdgcn_exp_compr: { |
9491 | if (!Subtarget->hasCompressedExport()) { |
9492 | DiagnosticInfoUnsupported BadIntrin( |
9493 | DAG.getMachineFunction().getFunction(), |
9494 | "intrinsic not supported on subtarget" , DL.getDebugLoc()); |
9495 | DAG.getContext()->diagnose(DI: BadIntrin); |
9496 | } |
9497 | SDValue Src0 = Op.getOperand(i: 4); |
9498 | SDValue Src1 = Op.getOperand(i: 5); |
9499 | // Hack around illegal type on SI by directly selecting it. |
9500 | if (isTypeLegal(VT: Src0.getValueType())) |
9501 | return SDValue(); |
9502 | |
9503 | const ConstantSDNode *Done = cast<ConstantSDNode>(Val: Op.getOperand(i: 6)); |
9504 | SDValue Undef = DAG.getUNDEF(VT: MVT::f32); |
9505 | const SDValue Ops[] = { |
9506 | Op.getOperand(i: 2), // tgt |
9507 | DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f32, Operand: Src0), // src0 |
9508 | DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f32, Operand: Src1), // src1 |
9509 | Undef, // src2 |
9510 | Undef, // src3 |
9511 | Op.getOperand(i: 7), // vm |
9512 | DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // compr |
9513 | Op.getOperand(i: 3), // en |
9514 | Op.getOperand(i: 0) // Chain |
9515 | }; |
9516 | |
9517 | unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE; |
9518 | return SDValue(DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: Op->getVTList(), Ops), 0); |
9519 | } |
9520 | case Intrinsic::amdgcn_s_barrier: { |
9521 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
9522 | if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) { |
9523 | unsigned WGSize = ST.getFlatWorkGroupSizes(F: MF.getFunction()).second; |
9524 | if (WGSize <= ST.getWavefrontSize()) |
9525 | return SDValue(DAG.getMachineNode(Opcode: AMDGPU::WAVE_BARRIER, dl: DL, VT: MVT::Other, |
9526 | Op1: Op.getOperand(i: 0)), 0); |
9527 | } |
9528 | |
9529 | // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait |
9530 | if (ST.hasSplitBarriers()) { |
9531 | SDValue K = |
9532 | DAG.getTargetConstant(Val: AMDGPU::Barrier::WORKGROUP, DL, VT: MVT::i32); |
9533 | SDValue BarSignal = |
9534 | SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_BARRIER_SIGNAL_IMM, dl: DL, |
9535 | VT: MVT::Other, Op1: K, Op2: Op.getOperand(i: 0)), |
9536 | 0); |
9537 | SDValue BarWait = |
9538 | SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_BARRIER_WAIT, dl: DL, VT: MVT::Other, Op1: K, |
9539 | Op2: BarSignal.getValue(R: 0)), |
9540 | 0); |
9541 | return BarWait; |
9542 | } |
9543 | |
9544 | return SDValue(); |
}
9546 | |
9547 | case Intrinsic::amdgcn_struct_tbuffer_store: |
9548 | case Intrinsic::amdgcn_struct_ptr_tbuffer_store: { |
9549 | SDValue VData = Op.getOperand(i: 2); |
9550 | bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); |
9551 | if (IsD16) |
9552 | VData = handleD16VData(VData, DAG); |
9553 | SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG); |
9554 | auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 5), DAG); |
9555 | auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 6), DAG, Subtarget); |
9556 | SDValue Ops[] = { |
9557 | Chain, |
9558 | VData, // vdata |
9559 | Rsrc, // rsrc |
9560 | Op.getOperand(i: 4), // vindex |
9561 | Offsets.first, // voffset |
9562 | SOffset, // soffset |
9563 | Offsets.second, // offset |
9564 | Op.getOperand(i: 7), // format |
9565 | Op.getOperand(i: 8), // cachepolicy, swizzled buffer |
9566 | DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // idxen |
9567 | }; |
9568 | unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : |
9569 | AMDGPUISD::TBUFFER_STORE_FORMAT; |
9570 | MemSDNode *M = cast<MemSDNode>(Val&: Op); |
9571 | return DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: Op->getVTList(), Ops, |
9572 | MemVT: M->getMemoryVT(), MMO: M->getMemOperand()); |
9573 | } |
9574 | |
9575 | case Intrinsic::amdgcn_raw_tbuffer_store: |
9576 | case Intrinsic::amdgcn_raw_ptr_tbuffer_store: { |
9577 | SDValue VData = Op.getOperand(i: 2); |
9578 | bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); |
9579 | if (IsD16) |
9580 | VData = handleD16VData(VData, DAG); |
9581 | SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG); |
9582 | auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG); |
9583 | auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget); |
9584 | SDValue Ops[] = { |
9585 | Chain, |
9586 | VData, // vdata |
9587 | Rsrc, // rsrc |
9588 | DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex |
9589 | Offsets.first, // voffset |
9590 | SOffset, // soffset |
9591 | Offsets.second, // offset |
9592 | Op.getOperand(i: 6), // format |
9593 | Op.getOperand(i: 7), // cachepolicy, swizzled buffer |
9594 | DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen |
9595 | }; |
9596 | unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : |
9597 | AMDGPUISD::TBUFFER_STORE_FORMAT; |
9598 | MemSDNode *M = cast<MemSDNode>(Val&: Op); |
9599 | return DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: Op->getVTList(), Ops, |
9600 | MemVT: M->getMemoryVT(), MMO: M->getMemOperand()); |
9601 | } |
9602 | |
9603 | case Intrinsic::amdgcn_raw_buffer_store: |
9604 | case Intrinsic::amdgcn_raw_ptr_buffer_store: |
9605 | case Intrinsic::amdgcn_raw_buffer_store_format: |
9606 | case Intrinsic::amdgcn_raw_ptr_buffer_store_format: { |
9607 | const bool IsFormat = |
9608 | IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format || |
9609 | IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format; |
9610 | |
9611 | SDValue VData = Op.getOperand(i: 2); |
9612 | EVT VDataVT = VData.getValueType(); |
9613 | EVT EltType = VDataVT.getScalarType(); |
9614 | bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16); |
9615 | if (IsD16) { |
9616 | VData = handleD16VData(VData, DAG); |
9617 | VDataVT = VData.getValueType(); |
9618 | } |
9619 | |
9620 | if (!isTypeLegal(VT: VDataVT)) { |
9621 | VData = |
9622 | DAG.getNode(Opcode: ISD::BITCAST, DL, |
9623 | VT: getEquivalentMemType(Context&: *DAG.getContext(), VT: VDataVT), Operand: VData); |
9624 | } |
9625 | |
9626 | SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG); |
9627 | auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG); |
9628 | auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget); |
9629 | SDValue Ops[] = { |
9630 | Chain, |
9631 | VData, |
9632 | Rsrc, |
9633 | DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex |
9634 | Offsets.first, // voffset |
9635 | SOffset, // soffset |
9636 | Offsets.second, // offset |
9637 | Op.getOperand(i: 6), // cachepolicy, swizzled buffer |
9638 | DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen |
9639 | }; |
9640 | unsigned Opc = |
9641 | IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE; |
9642 | Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; |
9643 | MemSDNode *M = cast<MemSDNode>(Val&: Op); |
9644 | |
9645 | // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics |
9646 | if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32) |
9647 | return handleByteShortBufferStores(DAG, VDataType: VDataVT, DL, Ops, M); |
9648 | |
9649 | return DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: Op->getVTList(), Ops, |
9650 | MemVT: M->getMemoryVT(), MMO: M->getMemOperand()); |
9651 | } |
9652 | |
9653 | case Intrinsic::amdgcn_struct_buffer_store: |
9654 | case Intrinsic::amdgcn_struct_ptr_buffer_store: |
9655 | case Intrinsic::amdgcn_struct_buffer_store_format: |
9656 | case Intrinsic::amdgcn_struct_ptr_buffer_store_format: { |
9657 | const bool IsFormat = |
9658 | IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format || |
9659 | IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format; |
9660 | |
9661 | SDValue VData = Op.getOperand(i: 2); |
9662 | EVT VDataVT = VData.getValueType(); |
9663 | EVT EltType = VDataVT.getScalarType(); |
9664 | bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16); |
9665 | |
9666 | if (IsD16) { |
9667 | VData = handleD16VData(VData, DAG); |
9668 | VDataVT = VData.getValueType(); |
9669 | } |
9670 | |
9671 | if (!isTypeLegal(VT: VDataVT)) { |
9672 | VData = |
9673 | DAG.getNode(Opcode: ISD::BITCAST, DL, |
9674 | VT: getEquivalentMemType(Context&: *DAG.getContext(), VT: VDataVT), Operand: VData); |
9675 | } |
9676 | |
9677 | auto Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG); |
9678 | auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 5), DAG); |
9679 | auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 6), DAG, Subtarget); |
9680 | SDValue Ops[] = { |
9681 | Chain, |
9682 | VData, |
9683 | Rsrc, |
9684 | Op.getOperand(i: 4), // vindex |
9685 | Offsets.first, // voffset |
9686 | SOffset, // soffset |
9687 | Offsets.second, // offset |
9688 | Op.getOperand(i: 7), // cachepolicy, swizzled buffer |
9689 | DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // idxen |
9690 | }; |
9691 | unsigned Opc = |
9692 | !IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; |
9693 | Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; |
9694 | MemSDNode *M = cast<MemSDNode>(Val&: Op); |
9695 | |
9696 | // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics |
9697 | EVT VDataType = VData.getValueType().getScalarType(); |
9698 | if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32) |
9699 | return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M); |
9700 | |
9701 | return DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: Op->getVTList(), Ops, |
9702 | MemVT: M->getMemoryVT(), MMO: M->getMemOperand()); |
9703 | } |
9704 | case Intrinsic::amdgcn_raw_buffer_load_lds: |
9705 | case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: |
9706 | case Intrinsic::amdgcn_struct_buffer_load_lds: |
9707 | case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: { |
9708 | assert(!AMDGPU::isGFX12Plus(*Subtarget)); |
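// Select the BUFFER_LOAD_*_LDS pseudo matching the element size and the
// addressing mode. The LDS base address travels in M0, and the node carries
// both a load memory operand (global) and a store memory operand (LDS).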
9709 | unsigned Opc; |
9710 | bool HasVIndex = |
9711 | IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds || |
9712 | IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds; |
9713 | unsigned OpOffset = HasVIndex ? 1 : 0; |
9714 | SDValue VOffset = Op.getOperand(i: 5 + OpOffset); |
9715 | bool HasVOffset = !isNullConstant(V: VOffset); |
9716 | unsigned Size = Op->getConstantOperandVal(Num: 4); |
9717 | |
9718 | switch (Size) { |
9719 | default: |
9720 | return SDValue(); |
9721 | case 1: |
9722 | Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN |
9723 | : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN |
9724 | : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN |
9725 | : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET; |
9726 | break; |
9727 | case 2: |
9728 | Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN |
9729 | : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN |
9730 | : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN |
9731 | : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET; |
9732 | break; |
9733 | case 4: |
9734 | Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN |
9735 | : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN |
9736 | : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN |
9737 | : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET; |
9738 | break; |
9739 | } |
9740 | |
9741 | SDValue M0Val = copyToM0(DAG, Chain, DL, V: Op.getOperand(i: 3)); |
9742 | |
9743 | SmallVector<SDValue, 8> Ops; |
9744 | |
9745 | if (HasVIndex && HasVOffset) |
9746 | Ops.push_back(Elt: DAG.getBuildVector(VT: MVT::v2i32, DL, |
9747 | Ops: { Op.getOperand(i: 5), // VIndex |
9748 | VOffset })); |
9749 | else if (HasVIndex) |
9750 | Ops.push_back(Elt: Op.getOperand(i: 5)); |
9751 | else if (HasVOffset) |
9752 | Ops.push_back(Elt: VOffset); |
9753 | |
9754 | SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG); |
9755 | Ops.push_back(Elt: Rsrc); |
9756 | Ops.push_back(Elt: Op.getOperand(i: 6 + OpOffset)); // soffset |
9757 | Ops.push_back(Elt: Op.getOperand(i: 7 + OpOffset)); // imm offset |
9758 | unsigned Aux = Op.getConstantOperandVal(i: 8 + OpOffset); |
9759 | Ops.push_back( |
9760 | Elt: DAG.getTargetConstant(Val: Aux & AMDGPU::CPol::ALL, DL, VT: MVT::i8)); // cpol |
9761 | Ops.push_back(Elt: DAG.getTargetConstant( |
9762 | Val: Aux & AMDGPU::CPol::SWZ_pregfx12 ? 1 : 0, DL, VT: MVT::i8)); // swz |
9763 | Ops.push_back(Elt: M0Val.getValue(R: 0)); // Chain |
9764 | Ops.push_back(Elt: M0Val.getValue(R: 1)); // Glue |
9765 | |
9766 | auto *M = cast<MemSDNode>(Val&: Op); |
9767 | MachineMemOperand *LoadMMO = M->getMemOperand(); |
9768 | // Don't set the offset value here because the pointer points to the base of |
9769 | // the buffer. |
9770 | MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo(); |
9771 | |
9772 | MachinePointerInfo StorePtrI = LoadPtrI; |
9773 | LoadPtrI.V = PoisonValue::get( |
9774 | T: PointerType::get(C&: *DAG.getContext(), AddressSpace: AMDGPUAS::GLOBAL_ADDRESS)); |
9775 | LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS; |
9776 | StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS; |
9777 | |
9778 | auto F = LoadMMO->getFlags() & |
9779 | ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad); |
9780 | LoadMMO = |
9781 | MF.getMachineMemOperand(PtrInfo: LoadPtrI, F: F | MachineMemOperand::MOLoad, Size, |
9782 | BaseAlignment: LoadMMO->getBaseAlign(), AAInfo: LoadMMO->getAAInfo()); |
9783 | |
9784 | MachineMemOperand *StoreMMO = MF.getMachineMemOperand( |
9785 | PtrInfo: StorePtrI, F: F | MachineMemOperand::MOStore, Size: sizeof(int32_t), |
9786 | BaseAlignment: LoadMMO->getBaseAlign(), AAInfo: LoadMMO->getAAInfo()); |
9787 | |
9788 | auto Load = DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: M->getVTList(), Ops); |
9789 | DAG.setNodeMemRefs(N: Load, NewMemRefs: {LoadMMO, StoreMMO}); |
9790 | |
9791 | return SDValue(Load, 0); |
9792 | } |
9793 | case Intrinsic::amdgcn_global_load_lds: { |
9794 | unsigned Opc; |
9795 | unsigned Size = Op->getConstantOperandVal(Num: 4); |
9796 | switch (Size) { |
9797 | default: |
9798 | return SDValue(); |
9799 | case 1: |
9800 | Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE; |
9801 | break; |
9802 | case 2: |
9803 | Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT; |
9804 | break; |
9805 | case 4: |
9806 | Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD; |
9807 | break; |
9808 | } |
9809 | |
9810 | auto *M = cast<MemSDNode>(Val&: Op); |
9811 | SDValue M0Val = copyToM0(DAG, Chain, DL, V: Op.getOperand(i: 3)); |
9812 | |
9813 | SmallVector<SDValue, 6> Ops; |
9814 | |
9815 | SDValue Addr = Op.getOperand(i: 2); // Global ptr |
9816 | SDValue VOffset; |
9817 | // Try to split SAddr and VOffset. Global and LDS pointers share the same |
9818 | // immediate offset, so we cannot use a regular SelectGlobalSAddr(). |
9819 | if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) { |
9820 | SDValue LHS = Addr.getOperand(i: 0); |
9821 | SDValue RHS = Addr.getOperand(i: 1); |
9822 | |
9823 | if (LHS->isDivergent()) |
9824 | std::swap(a&: LHS, b&: RHS); |
9825 | |
9826 | if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND && |
9827 | RHS.getOperand(i: 0).getValueType() == MVT::i32) { |
9828 | // add (i64 sgpr), (zero_extend (i32 vgpr)) |
9829 | Addr = LHS; |
9830 | VOffset = RHS.getOperand(i: 0); |
9831 | } |
9832 | } |
9833 | |
9834 | Ops.push_back(Elt: Addr); |
9835 | if (!Addr->isDivergent()) { |
9836 | Opc = AMDGPU::getGlobalSaddrOp(Opcode: Opc); |
9837 | if (!VOffset) |
9838 | VOffset = SDValue( |
9839 | DAG.getMachineNode(Opcode: AMDGPU::V_MOV_B32_e32, dl: DL, VT: MVT::i32, |
9840 | Op1: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32)), 0); |
9841 | Ops.push_back(Elt: VOffset); |
9842 | } |
9843 | |
9844 | Ops.push_back(Elt: Op.getOperand(i: 5)); // Offset |
9845 | Ops.push_back(Elt: Op.getOperand(i: 6)); // CPol |
9846 | Ops.push_back(Elt: M0Val.getValue(R: 0)); // Chain |
9847 | Ops.push_back(Elt: M0Val.getValue(R: 1)); // Glue |
9848 | |
9849 | MachineMemOperand *LoadMMO = M->getMemOperand(); |
9850 | MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo(); |
9851 | LoadPtrI.Offset = Op->getConstantOperandVal(Num: 5); |
9852 | MachinePointerInfo StorePtrI = LoadPtrI; |
9853 | LoadPtrI.V = PoisonValue::get( |
9854 | T: PointerType::get(C&: *DAG.getContext(), AddressSpace: AMDGPUAS::GLOBAL_ADDRESS)); |
9855 | LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS; |
9856 | StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS; |
9857 | auto F = LoadMMO->getFlags() & |
9858 | ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad); |
9859 | LoadMMO = |
9860 | MF.getMachineMemOperand(PtrInfo: LoadPtrI, F: F | MachineMemOperand::MOLoad, Size, |
9861 | BaseAlignment: LoadMMO->getBaseAlign(), AAInfo: LoadMMO->getAAInfo()); |
9862 | MachineMemOperand *StoreMMO = MF.getMachineMemOperand( |
9863 | PtrInfo: StorePtrI, F: F | MachineMemOperand::MOStore, Size: sizeof(int32_t), BaseAlignment: Align(4), |
9864 | AAInfo: LoadMMO->getAAInfo()); |
9865 | |
9866 | auto Load = DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: Op->getVTList(), Ops); |
9867 | DAG.setNodeMemRefs(N: Load, NewMemRefs: {LoadMMO, StoreMMO}); |
9868 | |
9869 | return SDValue(Load, 0); |
9870 | } |
9871 | case Intrinsic::amdgcn_end_cf: |
9872 | return SDValue(DAG.getMachineNode(Opcode: AMDGPU::SI_END_CF, dl: DL, VT: MVT::Other, |
9873 | Op1: Op->getOperand(Num: 2), Op2: Chain), 0); |
9874 | case Intrinsic::amdgcn_s_barrier_init: |
9875 | case Intrinsic::amdgcn_s_barrier_join: |
9876 | case Intrinsic::amdgcn_s_wakeup_barrier: { |
9877 | SDValue Chain = Op->getOperand(Num: 0); |
9878 | SmallVector<SDValue, 2> Ops; |
9879 | SDValue BarOp = Op->getOperand(Num: 2); |
9880 | unsigned Opc; |
9881 | bool IsInlinableBarID = false; |
9882 | int64_t BarVal; |
9883 | |
9884 | if (isa<ConstantSDNode>(Val: BarOp)) { |
9885 | BarVal = cast<ConstantSDNode>(Val&: BarOp)->getSExtValue(); |
9886 | IsInlinableBarID = AMDGPU::isInlinableIntLiteral(Literal: BarVal); |
9887 | } |
9888 | |
9889 | if (IsInlinableBarID) { |
9890 | switch (IntrinsicID) { |
9891 | default: |
9892 | return SDValue(); |
9893 | case Intrinsic::amdgcn_s_barrier_init: |
9894 | Opc = AMDGPU::S_BARRIER_INIT_IMM; |
9895 | break; |
9896 | case Intrinsic::amdgcn_s_barrier_join: |
9897 | Opc = AMDGPU::S_BARRIER_JOIN_IMM; |
9898 | break; |
9899 | case Intrinsic::amdgcn_s_wakeup_barrier: |
9900 | Opc = AMDGPU::S_WAKEUP_BARRIER_IMM; |
9901 | break; |
9902 | } |
9903 | |
9904 | SDValue K = DAG.getTargetConstant(Val: BarVal, DL, VT: MVT::i32); |
9905 | Ops.push_back(Elt: K); |
9906 | } else { |
9907 | switch (IntrinsicID) { |
9908 | default: |
9909 | return SDValue(); |
9910 | case Intrinsic::amdgcn_s_barrier_init: |
9911 | Opc = AMDGPU::S_BARRIER_INIT_M0; |
9912 | break; |
9913 | case Intrinsic::amdgcn_s_barrier_join: |
9914 | Opc = AMDGPU::S_BARRIER_JOIN_M0; |
9915 | break; |
9916 | case Intrinsic::amdgcn_s_wakeup_barrier: |
9917 | Opc = AMDGPU::S_WAKEUP_BARRIER_M0; |
9918 | break; |
9919 | } |
9920 | } |
9921 | |
9922 | if (IntrinsicID == Intrinsic::amdgcn_s_barrier_init) { |
9923 | SDValue M0Val; |
9924 | // Member count will be read from M0[16:22] |
9925 | M0Val = DAG.getNode(Opcode: ISD::SHL, DL, VT: MVT::i32, N1: Op.getOperand(i: 3), |
9926 | N2: DAG.getShiftAmountConstant(Val: 16, VT: MVT::i32, DL)); |
9927 | |
9928 | if (!IsInlinableBarID) { |
// If the barrier ID is not an inline constant, it must be referenced via
// M0[4:0]. OR it with the member count so both are included in M0.
9932 | M0Val = SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_OR_B32, dl: DL, VT: MVT::i32, |
9933 | Op1: Op.getOperand(i: 2), Op2: M0Val), |
9934 | 0); |
9935 | } |
9936 | Ops.push_back(Elt: copyToM0(DAG, Chain, DL, V: M0Val).getValue(R: 0)); |
9937 | } else if (!IsInlinableBarID) { |
9938 | Ops.push_back(Elt: copyToM0(DAG, Chain, DL, V: BarOp).getValue(R: 0)); |
9939 | } |
9940 | |
9941 | auto NewMI = DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: Op->getVTList(), Ops); |
9942 | return SDValue(NewMI, 0); |
9943 | } |
9944 | default: { |
9945 | if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = |
9946 | AMDGPU::getImageDimIntrinsicInfo(Intr: IntrinsicID)) |
9947 | return lowerImage(Op, Intr: ImageDimIntr, DAG, WithChain: true); |
9948 | |
9949 | return Op; |
9950 | } |
9951 | } |
9952 | } |
9953 | |
9954 | // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: |
9955 | // offset (the offset that is included in bounds checking and swizzling, to be |
9956 | // split between the instruction's voffset and immoffset fields) and soffset |
9957 | // (the offset that is excluded from bounds checking and swizzling, to go in |
9958 | // the instruction's soffset field). This function takes the first kind of |
9959 | // offset and figures out how to split it between voffset and immoffset. |
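// For example, with a 12-bit immoffset field (MaxImm == 4095), a combined
// constant offset of 4104 is split into Overflow = 4096 (added to voffset)
// and ImmOffset = 8, since the large power-of-2 remainder is more likely to
// be CSEd with neighboring accesses. The actual field width comes from
// SIInstrInfo::getMaxMUBUFImmOffset and varies by subtarget.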
9960 | std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets( |
9961 | SDValue Offset, SelectionDAG &DAG) const { |
9962 | SDLoc DL(Offset); |
9963 | const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST: *Subtarget); |
9964 | SDValue N0 = Offset; |
9965 | ConstantSDNode *C1 = nullptr; |
9966 | |
9967 | if ((C1 = dyn_cast<ConstantSDNode>(Val&: N0))) |
9968 | N0 = SDValue(); |
9969 | else if (DAG.isBaseWithConstantOffset(Op: N0)) { |
9970 | C1 = cast<ConstantSDNode>(Val: N0.getOperand(i: 1)); |
9971 | N0 = N0.getOperand(i: 0); |
9972 | } |
9973 | |
9974 | if (C1) { |
9975 | unsigned ImmOffset = C1->getZExtValue(); |
9976 | // If the immediate value is too big for the immoffset field, put only bits |
9977 | // that would normally fit in the immoffset field. The remaining value that |
9978 | // is copied/added for the voffset field is a large power of 2, and it |
9979 | // stands more chance of being CSEd with the copy/add for another similar |
9980 | // load/store. |
    // However, do not do that rounding down if the resulting overflow value
    // would be negative, as it appears to be illegal to have a negative
    // offset in the vgpr, even if adding the immediate offset makes it
    // positive.
9984 | unsigned Overflow = ImmOffset & ~MaxImm; |
9985 | ImmOffset -= Overflow; |
9986 | if ((int32_t)Overflow < 0) { |
9987 | Overflow += ImmOffset; |
9988 | ImmOffset = 0; |
9989 | } |
9990 | C1 = cast<ConstantSDNode>(Val: DAG.getTargetConstant(Val: ImmOffset, DL, VT: MVT::i32)); |
9991 | if (Overflow) { |
9992 | auto OverflowVal = DAG.getConstant(Val: Overflow, DL, VT: MVT::i32); |
9993 | if (!N0) |
9994 | N0 = OverflowVal; |
9995 | else { |
9996 | SDValue Ops[] = { N0, OverflowVal }; |
9997 | N0 = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, Ops); |
9998 | } |
9999 | } |
10000 | } |
10001 | if (!N0) |
10002 | N0 = DAG.getConstant(Val: 0, DL, VT: MVT::i32); |
10003 | if (!C1) |
10004 | C1 = cast<ConstantSDNode>(Val: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32)); |
10005 | return {N0, SDValue(C1, 0)}; |
10006 | } |
10007 | |
10008 | // Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store |
10009 | // the three offsets (voffset, soffset and instoffset) into the SDValue[3] array |
10010 | // pointed to by Offsets. |
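// A plain constant is split between soffset and instoffset when
// splitMUBUFOffset can represent it; a (base + constant) expression keeps the
// base in voffset and splits only the constant part; anything else is placed
// entirely in voffset, with soffset set to zero (or to SGPR_NULL on
// subtargets with a restricted soffset field).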
10011 | void SITargetLowering::setBufferOffsets(SDValue CombinedOffset, |
10012 | SelectionDAG &DAG, SDValue *Offsets, |
10013 | Align Alignment) const { |
10014 | const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); |
10015 | SDLoc DL(CombinedOffset); |
10016 | if (auto *C = dyn_cast<ConstantSDNode>(Val&: CombinedOffset)) { |
10017 | uint32_t Imm = C->getZExtValue(); |
10018 | uint32_t SOffset, ImmOffset; |
10019 | if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) { |
10020 | Offsets[0] = DAG.getConstant(Val: 0, DL, VT: MVT::i32); |
10021 | Offsets[1] = DAG.getConstant(Val: SOffset, DL, VT: MVT::i32); |
10022 | Offsets[2] = DAG.getTargetConstant(Val: ImmOffset, DL, VT: MVT::i32); |
10023 | return; |
10024 | } |
10025 | } |
10026 | if (DAG.isBaseWithConstantOffset(Op: CombinedOffset)) { |
10027 | SDValue N0 = CombinedOffset.getOperand(i: 0); |
10028 | SDValue N1 = CombinedOffset.getOperand(i: 1); |
10029 | uint32_t SOffset, ImmOffset; |
10030 | int Offset = cast<ConstantSDNode>(Val&: N1)->getSExtValue(); |
10031 | if (Offset >= 0 && |
10032 | TII->splitMUBUFOffset(Imm: Offset, SOffset, ImmOffset, Alignment)) { |
10033 | Offsets[0] = N0; |
10034 | Offsets[1] = DAG.getConstant(Val: SOffset, DL, VT: MVT::i32); |
10035 | Offsets[2] = DAG.getTargetConstant(Val: ImmOffset, DL, VT: MVT::i32); |
10036 | return; |
10037 | } |
10038 | } |
10039 | |
10040 | SDValue SOffsetZero = Subtarget->hasRestrictedSOffset() |
10041 | ? DAG.getRegister(Reg: AMDGPU::SGPR_NULL, VT: MVT::i32) |
10042 | : DAG.getConstant(Val: 0, DL, VT: MVT::i32); |
10043 | |
10044 | Offsets[0] = CombinedOffset; |
10045 | Offsets[1] = SOffsetZero; |
10046 | Offsets[2] = DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32); |
10047 | } |
10048 | |
10049 | SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer, |
10050 | SelectionDAG &DAG) const { |
10051 | if (!MaybePointer.getValueType().isScalarInteger()) |
10052 | return MaybePointer; |
10053 | |
10054 | SDLoc DL(MaybePointer); |
10055 | |
10056 | SDValue Rsrc = DAG.getBitcast(VT: MVT::v4i32, V: MaybePointer); |
10057 | return Rsrc; |
10058 | } |
10059 | |
10060 | // Wrap a global or flat pointer into a buffer intrinsic using the flags |
10061 | // specified in the intrinsic. |
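// The resulting v4i32 descriptor is laid out as:
//   word0 = pointer[31:0]
//   word1 = (pointer[63:32] & 0xffff) | (stride << 16)
//   word2 = NumRecords
//   word3 = Flags
// and is then bitcast to i128, the representation used for buffer resources.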
10062 | SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op, |
10063 | SelectionDAG &DAG) const { |
10064 | SDLoc Loc(Op); |
10065 | |
10066 | SDValue Pointer = Op->getOperand(Num: 1); |
10067 | SDValue Stride = Op->getOperand(Num: 2); |
10068 | SDValue NumRecords = Op->getOperand(Num: 3); |
10069 | SDValue Flags = Op->getOperand(Num: 4); |
10070 | |
10071 | auto [LowHalf, HighHalf] = DAG.SplitScalar(N: Pointer, DL: Loc, LoVT: MVT::i32, HiVT: MVT::i32); |
10072 | SDValue Mask = DAG.getConstant(Val: 0x0000ffff, DL: Loc, VT: MVT::i32); |
10073 | SDValue Masked = DAG.getNode(Opcode: ISD::AND, DL: Loc, VT: MVT::i32, N1: HighHalf, N2: Mask); |
10074 | std::optional<uint32_t> ConstStride = std::nullopt; |
10075 | if (auto *ConstNode = dyn_cast<ConstantSDNode>(Val&: Stride)) |
10076 | ConstStride = ConstNode->getZExtValue(); |
10077 | |
10078 | SDValue NewHighHalf = Masked; |
10079 | if (!ConstStride || *ConstStride != 0) { |
10080 | SDValue ShiftedStride; |
10081 | if (ConstStride) { |
10082 | ShiftedStride = DAG.getConstant(Val: *ConstStride << 16, DL: Loc, VT: MVT::i32); |
10083 | } else { |
10084 | SDValue ExtStride = DAG.getAnyExtOrTrunc(Op: Stride, DL: Loc, VT: MVT::i32); |
10085 | ShiftedStride = |
10086 | DAG.getNode(Opcode: ISD::SHL, DL: Loc, VT: MVT::i32, N1: ExtStride, |
10087 | N2: DAG.getShiftAmountConstant(Val: 16, VT: MVT::i32, DL: Loc)); |
10088 | } |
10089 | NewHighHalf = DAG.getNode(Opcode: ISD::OR, DL: Loc, VT: MVT::i32, N1: Masked, N2: ShiftedStride); |
10090 | } |
10091 | |
10092 | SDValue Rsrc = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: Loc, VT: MVT::v4i32, N1: LowHalf, |
10093 | N2: NewHighHalf, N3: NumRecords, N4: Flags); |
10094 | SDValue RsrcPtr = DAG.getNode(Opcode: ISD::BITCAST, DL: Loc, VT: MVT::i128, Operand: Rsrc); |
10095 | return RsrcPtr; |
10096 | } |
10097 | |
// Handle 8-bit and 16-bit buffer loads
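// With TFE the hardware additionally returns a status dword, so the load is
// emitted with a v2i32 result: element 0 holds the data (truncated back to
// the original width) and element 1 the status, and both are returned
// together with the chain.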
10099 | SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG, |
10100 | EVT LoadVT, SDLoc DL, |
10101 | ArrayRef<SDValue> Ops, |
10102 | MachineMemOperand *MMO, |
10103 | bool IsTFE) const { |
10104 | EVT IntVT = LoadVT.changeTypeToInteger(); |
10105 | |
10106 | if (IsTFE) { |
10107 | unsigned Opc = (LoadVT.getScalarType() == MVT::i8) |
10108 | ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE |
10109 | : AMDGPUISD::BUFFER_LOAD_USHORT_TFE; |
10110 | MachineFunction &MF = DAG.getMachineFunction(); |
10111 | MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, Offset: 0, Size: 8); |
10112 | SDVTList VTs = DAG.getVTList(VT1: MVT::v2i32, VT2: MVT::Other); |
10113 | SDValue Op = getMemIntrinsicNode(Opcode: Opc, DL, VTList: VTs, Ops, MemVT: MVT::v2i32, MMO: OpMMO, DAG); |
10114 | SDValue Status = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: Op, |
10115 | N2: DAG.getConstant(Val: 1, DL, VT: MVT::i32)); |
10116 | SDValue Data = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: Op, |
10117 | N2: DAG.getConstant(Val: 0, DL, VT: MVT::i32)); |
10118 | SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: IntVT, Operand: Data); |
10119 | SDValue Value = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: LoadVT, Operand: Trunc); |
10120 | return DAG.getMergeValues(Ops: {Value, Status, SDValue(Op.getNode(), 1)}, dl: DL); |
10121 | } |
10122 | |
10123 | unsigned Opc = (LoadVT.getScalarType() == MVT::i8) ? |
10124 | AMDGPUISD::BUFFER_LOAD_UBYTE : AMDGPUISD::BUFFER_LOAD_USHORT; |
10125 | |
10126 | SDVTList ResList = DAG.getVTList(VT1: MVT::i32, VT2: MVT::Other); |
10127 | SDValue BufferLoad = |
10128 | DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: ResList, Ops, MemVT: IntVT, MMO); |
10129 | SDValue LoadVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: IntVT, Operand: BufferLoad); |
10130 | LoadVal = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: LoadVT, Operand: LoadVal); |
10131 | |
10132 | return DAG.getMergeValues(Ops: {LoadVal, BufferLoad.getValue(R: 1)}, dl: DL); |
10133 | } |
10134 | |
// Handle 8-bit and 16-bit buffer stores
10136 | SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG, |
10137 | EVT VDataType, SDLoc DL, |
10138 | SDValue Ops[], |
10139 | MemSDNode *M) const { |
10140 | if (VDataType == MVT::f16 || VDataType == MVT::bf16) |
10141 | Ops[1] = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i16, Operand: Ops[1]); |
10142 | |
10143 | SDValue BufferStoreExt = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::i32, Operand: Ops[1]); |
10144 | Ops[1] = BufferStoreExt; |
10145 | unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE : |
10146 | AMDGPUISD::BUFFER_STORE_SHORT; |
10147 | ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9); |
10148 | return DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: M->getVTList(), Ops: OpsRef, MemVT: VDataType, |
10149 | MMO: M->getMemOperand()); |
10150 | } |
10151 | |
10152 | static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, |
10153 | ISD::LoadExtType ExtType, SDValue Op, |
10154 | const SDLoc &SL, EVT VT) { |
10155 | if (VT.bitsLT(VT: Op.getValueType())) |
10156 | return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: Op); |
10157 | |
10158 | switch (ExtType) { |
10159 | case ISD::SEXTLOAD: |
10160 | return DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL: SL, VT, Operand: Op); |
10161 | case ISD::ZEXTLOAD: |
10162 | return DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT, Operand: Op); |
10163 | case ISD::EXTLOAD: |
10164 | return DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT, Operand: Op); |
10165 | case ISD::NON_EXTLOAD: |
10166 | return Op; |
10167 | } |
10168 | |
10169 | llvm_unreachable("invalid ext type" ); |
10170 | } |
10171 | |
// Try to turn 8-bit and 16-bit scalar loads into SMEM-eligible 32-bit loads.
10173 | // TODO: Skip this on GFX12 which does have scalar sub-dword loads. |
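// For example, a uniform, 4-byte-aligned i8 load from the constant address
// space is rewritten as an i32 load of the same address, and the low bits are
// then masked or sign-extended back to the original type, which allows the
// load to be selected to an s_load_dword.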
SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
                                    DAGCombinerInfo &DCI) const {
10175 | SelectionDAG &DAG = DCI.DAG; |
10176 | if (Ld->getAlign() < Align(4) || Ld->isDivergent()) |
10177 | return SDValue(); |
10178 | |
10179 | // FIXME: Constant loads should all be marked invariant. |
10180 | unsigned AS = Ld->getAddressSpace(); |
10181 | if (AS != AMDGPUAS::CONSTANT_ADDRESS && |
10182 | AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT && |
10183 | (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant())) |
10184 | return SDValue(); |
10185 | |
10186 | // Don't do this early, since it may interfere with adjacent load merging for |
10187 | // illegal types. We can avoid losing alignment information for exotic types |
10188 | // pre-legalize. |
10189 | EVT MemVT = Ld->getMemoryVT(); |
10190 | if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) || |
10191 | MemVT.getSizeInBits() >= 32) |
10192 | return SDValue(); |
10193 | |
10194 | SDLoc SL(Ld); |
10195 | |
10196 | assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) && |
10197 | "unexpected vector extload" ); |
10198 | |
10199 | // TODO: Drop only high part of range. |
10200 | SDValue Ptr = Ld->getBasePtr(); |
10201 | SDValue NewLoad = DAG.getLoad( |
10202 | AM: ISD::UNINDEXED, ExtType: ISD::NON_EXTLOAD, VT: MVT::i32, dl: SL, Chain: Ld->getChain(), Ptr, |
10203 | Offset: Ld->getOffset(), PtrInfo: Ld->getPointerInfo(), MemVT: MVT::i32, Alignment: Ld->getAlign(), |
10204 | MMOFlags: Ld->getMemOperand()->getFlags(), AAInfo: Ld->getAAInfo(), |
10205 | Ranges: nullptr); // Drop ranges |
10206 | |
10207 | EVT TruncVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: MemVT.getSizeInBits()); |
10208 | if (MemVT.isFloatingPoint()) { |
10209 | assert(Ld->getExtensionType() == ISD::NON_EXTLOAD && |
10210 | "unexpected fp extload" ); |
10211 | TruncVT = MemVT.changeTypeToInteger(); |
10212 | } |
10213 | |
10214 | SDValue Cvt = NewLoad; |
10215 | if (Ld->getExtensionType() == ISD::SEXTLOAD) { |
10216 | Cvt = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL: SL, VT: MVT::i32, N1: NewLoad, |
10217 | N2: DAG.getValueType(TruncVT)); |
10218 | } else if (Ld->getExtensionType() == ISD::ZEXTLOAD || |
10219 | Ld->getExtensionType() == ISD::NON_EXTLOAD) { |
10220 | Cvt = DAG.getZeroExtendInReg(Op: NewLoad, DL: SL, VT: TruncVT); |
10221 | } else { |
10222 | assert(Ld->getExtensionType() == ISD::EXTLOAD); |
10223 | } |
10224 | |
10225 | EVT VT = Ld->getValueType(ResNo: 0); |
10226 | EVT IntVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: VT.getSizeInBits()); |
10227 | |
10228 | DCI.AddToWorklist(N: Cvt.getNode()); |
10229 | |
10230 | // We may need to handle exotic cases, such as i16->i64 extloads, so insert |
10231 | // the appropriate extension from the 32-bit load. |
10232 | Cvt = getLoadExtOrTrunc(DAG, ExtType: Ld->getExtensionType(), Op: Cvt, SL, VT: IntVT); |
10233 | DCI.AddToWorklist(N: Cvt.getNode()); |
10234 | |
10235 | // Handle conversion back to floating point if necessary. |
10236 | Cvt = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Cvt); |
10237 | |
10238 | return DAG.getMergeValues(Ops: { Cvt, NewLoad.getValue(R: 1) }, dl: SL); |
10239 | } |
10240 | |
10241 | static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, |
10242 | const SIMachineFunctionInfo &Info) { |
10243 | // TODO: Should check if the address can definitely not access stack. |
10244 | if (Info.isEntryFunction()) |
10245 | return Info.getUserSGPRInfo().hasFlatScratchInit(); |
10246 | return true; |
10247 | } |
10248 | |
10249 | SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { |
10250 | SDLoc DL(Op); |
10251 | LoadSDNode *Load = cast<LoadSDNode>(Val&: Op); |
10252 | ISD::LoadExtType ExtType = Load->getExtensionType(); |
10253 | EVT MemVT = Load->getMemoryVT(); |
10254 | |
10255 | if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) { |
10256 | if (MemVT == MVT::i16 && isTypeLegal(VT: MVT::i16)) |
10257 | return SDValue(); |
10258 | |
10259 | // FIXME: Copied from PPC |
10260 | // First, load into 32 bits, then truncate to 1 bit. |
10261 | |
10262 | SDValue Chain = Load->getChain(); |
10263 | SDValue BasePtr = Load->getBasePtr(); |
10264 | MachineMemOperand *MMO = Load->getMemOperand(); |
10265 | |
10266 | EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16; |
10267 | |
10268 | SDValue NewLD = DAG.getExtLoad(ExtType: ISD::EXTLOAD, dl: DL, VT: MVT::i32, Chain, |
10269 | Ptr: BasePtr, MemVT: RealMemVT, MMO); |
10270 | |
10271 | if (!MemVT.isVector()) { |
10272 | SDValue Ops[] = { |
10273 | DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MemVT, Operand: NewLD), |
10274 | NewLD.getValue(R: 1) |
10275 | }; |
10276 | |
10277 | return DAG.getMergeValues(Ops, dl: DL); |
10278 | } |
10279 | |
10280 | SmallVector<SDValue, 3> Elts; |
10281 | for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) { |
10282 | SDValue Elt = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: NewLD, |
10283 | N2: DAG.getConstant(Val: I, DL, VT: MVT::i32)); |
10284 | |
10285 | Elts.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i1, Operand: Elt)); |
10286 | } |
10287 | |
10288 | SDValue Ops[] = { |
10289 | DAG.getBuildVector(VT: MemVT, DL, Ops: Elts), |
10290 | NewLD.getValue(R: 1) |
10291 | }; |
10292 | |
10293 | return DAG.getMergeValues(Ops, dl: DL); |
10294 | } |
10295 | |
10296 | if (!MemVT.isVector()) |
10297 | return SDValue(); |
10298 | |
10299 | assert(Op.getValueType().getVectorElementType() == MVT::i32 && |
10300 | "Custom lowering for non-i32 vectors hasn't been implemented." ); |
10301 | |
10302 | Align Alignment = Load->getAlign(); |
10303 | unsigned AS = Load->getAddressSpace(); |
10304 | if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS && |
10305 | Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) { |
10306 | return SplitVectorLoad(Op, DAG); |
10307 | } |
10308 | |
10309 | MachineFunction &MF = DAG.getMachineFunction(); |
10310 | SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
  // If there is a possibility that flat instructions access scratch memory,
  // then we need to use the same legalization rules we use for private
  // accesses.
10313 | if (AS == AMDGPUAS::FLAT_ADDRESS && |
10314 | !Subtarget->hasMultiDwordFlatScratchAddressing()) |
10315 | AS = addressMayBeAccessedAsPrivate(MMO: Load->getMemOperand(), Info: *MFI) ? |
10316 | AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS; |
10317 | |
10318 | unsigned NumElements = MemVT.getVectorNumElements(); |
10319 | |
10320 | if (AS == AMDGPUAS::CONSTANT_ADDRESS || |
10321 | AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { |
10322 | if (!Op->isDivergent() && Alignment >= Align(4) && NumElements < 32) { |
10323 | if (MemVT.isPow2VectorType() || |
10324 | (Subtarget->hasScalarDwordx3Loads() && NumElements == 3)) |
10325 | return SDValue(); |
10326 | return WidenOrSplitVectorLoad(Op, DAG); |
10327 | } |
10328 | // Non-uniform loads will be selected to MUBUF instructions, so they |
10329 | // have the same legalization requirements as global and private |
10330 | // loads. |
10331 | // |
10332 | } |
10333 | |
10334 | if (AS == AMDGPUAS::CONSTANT_ADDRESS || |
10335 | AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT || |
10336 | AS == AMDGPUAS::GLOBAL_ADDRESS) { |
10337 | if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() && |
10338 | Load->isSimple() && isMemOpHasNoClobberedMemOperand(N: Load) && |
10339 | Alignment >= Align(4) && NumElements < 32) { |
10340 | if (MemVT.isPow2VectorType() || |
10341 | (Subtarget->hasScalarDwordx3Loads() && NumElements == 3)) |
10342 | return SDValue(); |
10343 | return WidenOrSplitVectorLoad(Op, DAG); |
10344 | } |
10345 | // Non-uniform loads will be selected to MUBUF instructions, so they |
10346 | // have the same legalization requirements as global and private |
10347 | // loads. |
10348 | // |
10349 | } |
10350 | if (AS == AMDGPUAS::CONSTANT_ADDRESS || |
10351 | AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT || |
10352 | AS == AMDGPUAS::GLOBAL_ADDRESS || |
10353 | AS == AMDGPUAS::FLAT_ADDRESS) { |
10354 | if (NumElements > 4) |
10355 | return SplitVectorLoad(Op, DAG); |
10356 | // v3 loads not supported on SI. |
10357 | if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores()) |
10358 | return WidenOrSplitVectorLoad(Op, DAG); |
10359 | |
10360 | // v3 and v4 loads are supported for private and global memory. |
10361 | return SDValue(); |
10362 | } |
10363 | if (AS == AMDGPUAS::PRIVATE_ADDRESS) { |
10364 | // Depending on the setting of the private_element_size field in the |
10365 | // resource descriptor, we can only make private accesses up to a certain |
10366 | // size. |
10367 | switch (Subtarget->getMaxPrivateElementSize()) { |
10368 | case 4: { |
10369 | SDValue Ops[2]; |
10370 | std::tie(args&: Ops[0], args&: Ops[1]) = scalarizeVectorLoad(LD: Load, DAG); |
10371 | return DAG.getMergeValues(Ops, dl: DL); |
10372 | } |
10373 | case 8: |
10374 | if (NumElements > 2) |
10375 | return SplitVectorLoad(Op, DAG); |
10376 | return SDValue(); |
10377 | case 16: |
10378 | // Same as global/flat |
10379 | if (NumElements > 4) |
10380 | return SplitVectorLoad(Op, DAG); |
10381 | // v3 loads not supported on SI. |
10382 | if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores()) |
10383 | return WidenOrSplitVectorLoad(Op, DAG); |
10384 | |
10385 | return SDValue(); |
10386 | default: |
10387 | llvm_unreachable("unsupported private_element_size" ); |
10388 | } |
10389 | } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { |
10390 | unsigned Fast = 0; |
10391 | auto Flags = Load->getMemOperand()->getFlags(); |
10392 | if (allowsMisalignedMemoryAccessesImpl(Size: MemVT.getSizeInBits(), AddrSpace: AS, |
10393 | Alignment: Load->getAlign(), Flags, IsFast: &Fast) && |
10394 | Fast > 1) |
10395 | return SDValue(); |
10396 | |
10397 | if (MemVT.isVector()) |
10398 | return SplitVectorLoad(Op, DAG); |
10399 | } |
10400 | |
10401 | if (!allowsMemoryAccessForAlignment(Context&: *DAG.getContext(), DL: DAG.getDataLayout(), |
10402 | VT: MemVT, MMO: *Load->getMemOperand())) { |
10403 | SDValue Ops[2]; |
10404 | std::tie(args&: Ops[0], args&: Ops[1]) = expandUnalignedLoad(LD: Load, DAG); |
10405 | return DAG.getMergeValues(Ops, dl: DL); |
10406 | } |
10407 | |
10408 | return SDValue(); |
10409 | } |
10410 | |
10411 | SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { |
10412 | EVT VT = Op.getValueType(); |
10413 | if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 || |
10414 | VT.getSizeInBits() == 512) |
10415 | return splitTernaryVectorOp(Op, DAG); |
10416 | |
10417 | assert(VT.getSizeInBits() == 64); |
10418 | |
10419 | SDLoc DL(Op); |
10420 | SDValue Cond = Op.getOperand(i: 0); |
10421 | |
10422 | SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i32); |
10423 | SDValue One = DAG.getConstant(Val: 1, DL, VT: MVT::i32); |
10424 | |
10425 | SDValue LHS = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v2i32, Operand: Op.getOperand(i: 1)); |
10426 | SDValue RHS = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v2i32, Operand: Op.getOperand(i: 2)); |
10427 | |
10428 | SDValue Lo0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: LHS, N2: Zero); |
10429 | SDValue Lo1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: RHS, N2: Zero); |
10430 | |
10431 | SDValue Lo = DAG.getSelect(DL, VT: MVT::i32, Cond, LHS: Lo0, RHS: Lo1); |
10432 | |
10433 | SDValue Hi0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: LHS, N2: One); |
10434 | SDValue Hi1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: RHS, N2: One); |
10435 | |
10436 | SDValue Hi = DAG.getSelect(DL, VT: MVT::i32, Cond, LHS: Hi0, RHS: Hi1); |
10437 | |
10438 | SDValue Res = DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Lo, Hi}); |
10439 | return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Res); |
10440 | } |
10441 | |
10442 | // Catch division cases where we can use shortcuts with rcp and rsq |
10443 | // instructions. |
10444 | SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, |
10445 | SelectionDAG &DAG) const { |
10446 | SDLoc SL(Op); |
10447 | SDValue LHS = Op.getOperand(i: 0); |
10448 | SDValue RHS = Op.getOperand(i: 1); |
10449 | EVT VT = Op.getValueType(); |
10450 | const SDNodeFlags Flags = Op->getFlags(); |
10451 | |
10452 | bool AllowInaccurateRcp = Flags.hasApproximateFuncs() || |
10453 | DAG.getTarget().Options.UnsafeFPMath; |
10454 | |
10455 | if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(Val&: LHS)) { |
    // Without !fpmath accuracy information, we can't do more because we don't
    // know exactly whether rcp is accurate enough to meet the !fpmath
    // requirement. f16 is always accurate enough.
10459 | if (!AllowInaccurateRcp && VT != MVT::f16) |
10460 | return SDValue(); |
10461 | |
10462 | if (CLHS->isExactlyValue(V: 1.0)) { |
      // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
      // the CI documentation they have a worst-case error of 1 ulp.
      // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
      // use it as long as we aren't trying to use denormals.
      //
      // v_rcp_f16 and v_rsq_f16 DO support denormals and have a 0.51 ulp
      // error.
10469 | |
10470 | // 1.0 / sqrt(x) -> rsq(x) |
10471 | |
10472 | // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP |
10473 | // error seems really high at 2^29 ULP. |
10474 | // 1.0 / x -> rcp(x) |
10475 | return DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: RHS); |
10476 | } |
10477 | |
10478 | // Same as for 1.0, but expand the sign out of the constant. |
10479 | if (CLHS->isExactlyValue(V: -1.0)) { |
10480 | // -1.0 / x -> rcp (fneg x) |
10481 | SDValue FNegRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS); |
10482 | return DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: FNegRHS); |
10483 | } |
10484 | } |
10485 | |
10486 | // For f16 require afn or arcp. |
10487 | // For f32 require afn. |
10488 | if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal())) |
10489 | return SDValue(); |
10490 | |
10491 | // Turn into multiply by the reciprocal. |
10492 | // x / y -> x * (1.0 / y) |
10493 | SDValue Recip = DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: RHS); |
10494 | return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: LHS, N2: Recip, Flags); |
10495 | } |
10496 | |
10497 | SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op, |
10498 | SelectionDAG &DAG) const { |
10499 | SDLoc SL(Op); |
10500 | SDValue X = Op.getOperand(i: 0); |
10501 | SDValue Y = Op.getOperand(i: 1); |
10502 | EVT VT = Op.getValueType(); |
10503 | const SDNodeFlags Flags = Op->getFlags(); |
10504 | |
10505 | bool AllowInaccurateDiv = Flags.hasApproximateFuncs() || |
10506 | DAG.getTarget().Options.UnsafeFPMath; |
10507 | if (!AllowInaccurateDiv) |
10508 | return SDValue(); |
10509 | |
10510 | SDValue NegY = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Y); |
10511 | SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT); |
10512 | |
10513 | SDValue R = DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: Y); |
10514 | SDValue Tmp0 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: NegY, N2: R, N3: One); |
10515 | |
10516 | R = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: Tmp0, N2: R, N3: R); |
10517 | SDValue Tmp1 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: NegY, N2: R, N3: One); |
10518 | R = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: Tmp1, N2: R, N3: R); |
10519 | SDValue Ret = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: R); |
10520 | SDValue Tmp2 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: NegY, N2: Ret, N3: X); |
10521 | return DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: Tmp2, N2: R, N3: Ret); |
10522 | } |
10523 | |
10524 | static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, |
10525 | EVT VT, SDValue A, SDValue B, SDValue GlueChain, |
10526 | SDNodeFlags Flags) { |
10527 | if (GlueChain->getNumValues() <= 1) { |
10528 | return DAG.getNode(Opcode, DL: SL, VT, N1: A, N2: B, Flags); |
10529 | } |
10530 | |
10531 | assert(GlueChain->getNumValues() == 3); |
10532 | |
10533 | SDVTList VTList = DAG.getVTList(VT1: VT, VT2: MVT::Other, VT3: MVT::Glue); |
10534 | switch (Opcode) { |
10535 | default: llvm_unreachable("no chain equivalent for opcode" ); |
10536 | case ISD::FMUL: |
10537 | Opcode = AMDGPUISD::FMUL_W_CHAIN; |
10538 | break; |
10539 | } |
10540 | |
10541 | return DAG.getNode(Opcode, DL: SL, VTList, |
10542 | Ops: {GlueChain.getValue(R: 1), A, B, GlueChain.getValue(R: 2)}, |
10543 | Flags); |
10544 | } |
10545 | |
10546 | static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, |
10547 | EVT VT, SDValue A, SDValue B, SDValue C, |
10548 | SDValue GlueChain, SDNodeFlags Flags) { |
10549 | if (GlueChain->getNumValues() <= 1) { |
10550 | return DAG.getNode(Opcode, DL: SL, VT, Ops: {A, B, C}, Flags); |
10551 | } |
10552 | |
10553 | assert(GlueChain->getNumValues() == 3); |
10554 | |
10555 | SDVTList VTList = DAG.getVTList(VT1: VT, VT2: MVT::Other, VT3: MVT::Glue); |
10556 | switch (Opcode) { |
10557 | default: llvm_unreachable("no chain equivalent for opcode" ); |
10558 | case ISD::FMA: |
10559 | Opcode = AMDGPUISD::FMA_W_CHAIN; |
10560 | break; |
10561 | } |
10562 | |
10563 | return DAG.getNode(Opcode, DL: SL, VTList, |
10564 | Ops: {GlueChain.getValue(R: 1), A, B, C, GlueChain.getValue(R: 2)}, |
10565 | Flags); |
10566 | } |
10567 | |
10568 | SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const { |
10569 | if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG)) |
10570 | return FastLowered; |
10571 | |
10572 | SDLoc SL(Op); |
10573 | SDValue Src0 = Op.getOperand(i: 0); |
10574 | SDValue Src1 = Op.getOperand(i: 1); |
10575 | |
10576 | SDValue CvtSrc0 = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Src0); |
10577 | SDValue CvtSrc1 = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Src1); |
10578 | |
10579 | SDValue RcpSrc1 = DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT: MVT::f32, Operand: CvtSrc1); |
10580 | SDValue Quot = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f32, N1: CvtSrc0, N2: RcpSrc1); |
10581 | |
10582 | SDValue FPRoundFlag = DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32); |
10583 | SDValue BestQuot = DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT: MVT::f16, N1: Quot, N2: FPRoundFlag); |
10584 | |
10585 | return DAG.getNode(Opcode: AMDGPUISD::DIV_FIXUP, DL: SL, VT: MVT::f16, N1: BestQuot, N2: Src1, N3: Src0); |
10586 | } |
10587 | |
10588 | // Faster 2.5 ULP division that does not support denormals. |
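// If |RHS| > 2^96 the denominator is pre-scaled by 2^-32 before taking the
// reciprocal, and the final product is multiplied by the same factor (r3), so
// the scaling cancels out; otherwise the factor is 1.0. This keeps the
// reciprocal of very large denominators out of the denormal range that rcp
// cannot handle.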
10589 | SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const { |
10590 | SDNodeFlags Flags = Op->getFlags(); |
10591 | SDLoc SL(Op); |
10592 | SDValue LHS = Op.getOperand(i: 1); |
10593 | SDValue RHS = Op.getOperand(i: 2); |
10594 | |
10595 | SDValue r1 = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT: MVT::f32, Operand: RHS, Flags); |
10596 | |
10597 | const APFloat K0Val(0x1p+96f); |
10598 | const SDValue K0 = DAG.getConstantFP(Val: K0Val, DL: SL, VT: MVT::f32); |
10599 | |
10600 | const APFloat K1Val(0x1p-32f); |
10601 | const SDValue K1 = DAG.getConstantFP(Val: K1Val, DL: SL, VT: MVT::f32); |
10602 | |
10603 | const SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT: MVT::f32); |
10604 | |
10605 | EVT SetCCVT = |
10606 | getSetCCResultType(DL: DAG.getDataLayout(), Ctx&: *DAG.getContext(), VT: MVT::f32); |
10607 | |
10608 | SDValue r2 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: r1, RHS: K0, Cond: ISD::SETOGT); |
10609 | |
10610 | SDValue r3 = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::f32, N1: r2, N2: K1, N3: One, Flags); |
10611 | |
10612 | r1 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f32, N1: RHS, N2: r3, Flags); |
10613 | |
10614 | // rcp does not support denormals. |
10615 | SDValue r0 = DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT: MVT::f32, Operand: r1, Flags); |
10616 | |
10617 | SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f32, N1: LHS, N2: r0, Flags); |
10618 | |
10619 | return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f32, N1: r3, N2: Mul, Flags); |
10620 | } |
10621 | |
// Returns the immediate value for setting the F32 denorm mode when using the
// S_DENORM_MODE instruction.
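// The FP32 mode occupies bits [1:0] of the returned value and the function's
// existing FP64/FP16 mode is kept in bits [3:2].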
10624 | static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, |
10625 | const SIMachineFunctionInfo *Info, |
10626 | const GCNSubtarget *ST) { |
10627 | assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE" ); |
10628 | uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue(); |
10629 | uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2); |
10630 | return DAG.getTargetConstant(Val: Mode, DL: SDLoc(), VT: MVT::i32); |
10631 | } |
10632 | |
10633 | SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { |
10634 | if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG)) |
10635 | return FastLowered; |
10636 | |
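  // The non-fast path scales the operands with DIV_SCALE and refines an
  // approximate reciprocal of the scaled denominator with a chain of FMAs
  // (Fma0..Fma4), temporarily enabling FP32 denormal handling if the current
  // mode would otherwise flush it, and finally combines the result with
  // DIV_FMAS and DIV_FIXUP.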
  // The selection matcher assumes that anything with a chain selects to a
  // mayRaiseFPException machine instruction. Since we're introducing a chain
  // here, we need to explicitly report nofpexcept for the regular fdiv
  // lowering.
10641 | SDNodeFlags Flags = Op->getFlags(); |
10642 | Flags.setNoFPExcept(true); |
10643 | |
10644 | SDLoc SL(Op); |
10645 | SDValue LHS = Op.getOperand(i: 0); |
10646 | SDValue RHS = Op.getOperand(i: 1); |
10647 | |
10648 | const SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT: MVT::f32); |
10649 | |
10650 | SDVTList ScaleVT = DAG.getVTList(VT1: MVT::f32, VT2: MVT::i1); |
10651 | |
10652 | SDValue DenominatorScaled = DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL: SL, VTList: ScaleVT, |
10653 | Ops: {RHS, RHS, LHS}, Flags); |
10654 | SDValue NumeratorScaled = DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL: SL, VTList: ScaleVT, |
10655 | Ops: {LHS, RHS, LHS}, Flags); |
10656 | |
10657 | // Denominator is scaled to not be denormal, so using rcp is ok. |
10658 | SDValue ApproxRcp = DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT: MVT::f32, |
10659 | Operand: DenominatorScaled, Flags); |
10660 | SDValue NegDivScale0 = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f32, |
10661 | Operand: DenominatorScaled, Flags); |
10662 | |
10663 | using namespace AMDGPU::Hwreg; |
10664 | const unsigned Denorm32Reg = HwregEncoding::encode(Values: ID_MODE, Values: 4, Values: 2); |
10665 | const SDValue BitField = DAG.getTargetConstant(Val: Denorm32Reg, DL: SL, VT: MVT::i32); |
10666 | |
10667 | const MachineFunction &MF = DAG.getMachineFunction(); |
10668 | const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
10669 | const DenormalMode DenormMode = Info->getMode().FP32Denormals; |
10670 | |
10671 | const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE(); |
10672 | const bool HasDynamicDenormals = |
10673 | (DenormMode.Input == DenormalMode::Dynamic) || |
10674 | (DenormMode.Output == DenormalMode::Dynamic); |
10675 | |
10676 | SDValue SavedDenormMode; |
10677 | |
10678 | if (!PreservesDenormals) { |
10679 | // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV |
10680 | // lowering. The chain dependence is insufficient, and we need glue. We do |
10681 | // not need the glue variants in a strictfp function. |
10682 | |
10683 | SDVTList BindParamVTs = DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue); |
10684 | |
10685 | SDValue Glue = DAG.getEntryNode(); |
10686 | if (HasDynamicDenormals) { |
10687 | SDNode *GetReg = DAG.getMachineNode(Opcode: AMDGPU::S_GETREG_B32, dl: SL, |
10688 | VTs: DAG.getVTList(VT1: MVT::i32, VT2: MVT::Glue), |
10689 | Ops: {BitField, Glue}); |
10690 | SavedDenormMode = SDValue(GetReg, 0); |
10691 | |
10692 | Glue = DAG.getMergeValues( |
10693 | Ops: {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, dl: SL); |
10694 | } |
10695 | |
10696 | SDNode *EnableDenorm; |
10697 | if (Subtarget->hasDenormModeInst()) { |
10698 | const SDValue EnableDenormValue = |
10699 | getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, ST: Subtarget); |
10700 | |
10701 | EnableDenorm = DAG.getNode(Opcode: AMDGPUISD::DENORM_MODE, DL: SL, VTList: BindParamVTs, N1: Glue, |
10702 | N2: EnableDenormValue) |
10703 | .getNode(); |
10704 | } else { |
10705 | const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE, |
10706 | DL: SL, VT: MVT::i32); |
10707 | EnableDenorm = DAG.getMachineNode(Opcode: AMDGPU::S_SETREG_B32, dl: SL, VTs: BindParamVTs, |
10708 | Ops: {EnableDenormValue, BitField, Glue}); |
10709 | } |
10710 | |
10711 | SDValue Ops[3] = { |
10712 | NegDivScale0, |
10713 | SDValue(EnableDenorm, 0), |
10714 | SDValue(EnableDenorm, 1) |
10715 | }; |
10716 | |
10717 | NegDivScale0 = DAG.getMergeValues(Ops, dl: SL); |
10718 | } |
10719 | |
10720 | SDValue Fma0 = getFPTernOp(DAG, Opcode: ISD::FMA, SL, VT: MVT::f32, A: NegDivScale0, |
10721 | B: ApproxRcp, C: One, GlueChain: NegDivScale0, Flags); |
10722 | |
10723 | SDValue Fma1 = getFPTernOp(DAG, Opcode: ISD::FMA, SL, VT: MVT::f32, A: Fma0, B: ApproxRcp, |
10724 | C: ApproxRcp, GlueChain: Fma0, Flags); |
10725 | |
10726 | SDValue Mul = getFPBinOp(DAG, Opcode: ISD::FMUL, SL, VT: MVT::f32, A: NumeratorScaled, |
10727 | B: Fma1, GlueChain: Fma1, Flags); |
10728 | |
10729 | SDValue Fma2 = getFPTernOp(DAG, Opcode: ISD::FMA, SL, VT: MVT::f32, A: NegDivScale0, B: Mul, |
10730 | C: NumeratorScaled, GlueChain: Mul, Flags); |
10731 | |
10732 | SDValue Fma3 = getFPTernOp(DAG, Opcode: ISD::FMA, SL, VT: MVT::f32, |
10733 | A: Fma2, B: Fma1, C: Mul, GlueChain: Fma2, Flags); |
10734 | |
10735 | SDValue Fma4 = getFPTernOp(DAG, Opcode: ISD::FMA, SL, VT: MVT::f32, A: NegDivScale0, B: Fma3, |
10736 | C: NumeratorScaled, GlueChain: Fma3, Flags); |
10737 | |
10738 | if (!PreservesDenormals) { |
10739 | SDNode *DisableDenorm; |
10740 | if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) { |
10741 | const SDValue DisableDenormValue = getSPDenormModeValue( |
10742 | FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, ST: Subtarget); |
10743 | |
10744 | DisableDenorm = DAG.getNode(Opcode: AMDGPUISD::DENORM_MODE, DL: SL, VT: MVT::Other, |
10745 | N1: Fma4.getValue(R: 1), N2: DisableDenormValue, |
10746 | N3: Fma4.getValue(R: 2)).getNode(); |
10747 | } else { |
10748 | assert(HasDynamicDenormals == (bool)SavedDenormMode); |
10749 | const SDValue DisableDenormValue = |
10750 | HasDynamicDenormals |
10751 | ? SavedDenormMode |
10752 | : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, DL: SL, VT: MVT::i32); |
10753 | |
10754 | DisableDenorm = DAG.getMachineNode( |
10755 | Opcode: AMDGPU::S_SETREG_B32, dl: SL, VT: MVT::Other, |
10756 | Ops: {DisableDenormValue, BitField, Fma4.getValue(R: 1), Fma4.getValue(R: 2)}); |
10757 | } |
10758 | |
10759 | SDValue OutputChain = DAG.getNode(Opcode: ISD::TokenFactor, DL: SL, VT: MVT::Other, |
10760 | N1: SDValue(DisableDenorm, 0), N2: DAG.getRoot()); |
10761 | DAG.setRoot(OutputChain); |
10762 | } |
10763 | |
10764 | SDValue Scale = NumeratorScaled.getValue(R: 1); |
10765 | SDValue Fmas = DAG.getNode(Opcode: AMDGPUISD::DIV_FMAS, DL: SL, VT: MVT::f32, |
10766 | Ops: {Fma4, Fma1, Fma3, Scale}, Flags); |
10767 | |
10768 | return DAG.getNode(Opcode: AMDGPUISD::DIV_FIXUP, DL: SL, VT: MVT::f32, N1: Fmas, N2: RHS, N3: LHS, Flags); |
10769 | } |
10770 | |
10771 | SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { |
10772 | if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG)) |
10773 | return FastLowered; |
10774 | |
10775 | SDLoc SL(Op); |
10776 | SDValue X = Op.getOperand(i: 0); |
10777 | SDValue Y = Op.getOperand(i: 1); |
10778 | |
10779 | const SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT: MVT::f64); |
10780 | |
10781 | SDVTList ScaleVT = DAG.getVTList(VT1: MVT::f64, VT2: MVT::i1); |
10782 | |
10783 | SDValue DivScale0 = DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL: SL, VTList: ScaleVT, N1: Y, N2: Y, N3: X); |
10784 | |
10785 | SDValue NegDivScale0 = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f64, Operand: DivScale0); |
10786 | |
10787 | SDValue Rcp = DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT: MVT::f64, Operand: DivScale0); |
10788 | |
10789 | SDValue Fma0 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: MVT::f64, N1: NegDivScale0, N2: Rcp, N3: One); |
10790 | |
10791 | SDValue Fma1 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: MVT::f64, N1: Rcp, N2: Fma0, N3: Rcp); |
10792 | |
10793 | SDValue Fma2 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: MVT::f64, N1: NegDivScale0, N2: Fma1, N3: One); |
10794 | |
10795 | SDValue DivScale1 = DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL: SL, VTList: ScaleVT, N1: X, N2: Y, N3: X); |
10796 | |
10797 | SDValue Fma3 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: MVT::f64, N1: Fma1, N2: Fma2, N3: Fma1); |
10798 | SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f64, N1: DivScale1, N2: Fma3); |
10799 | |
10800 | SDValue Fma4 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: MVT::f64, |
10801 | N1: NegDivScale0, N2: Mul, N3: DivScale1); |
10802 | |
10803 | SDValue Scale; |
10804 | |
10805 | if (!Subtarget->hasUsableDivScaleConditionOutput()) { |
    // Work around a hardware bug on SI where the condition output from
    // div_scale is not usable.
10808 | |
10809 | const SDValue Hi = DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32); |
10810 | |
    // Figure out the scale to use for div_fmas.
10812 | SDValue NumBC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: X); |
10813 | SDValue DenBC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Y); |
10814 | SDValue Scale0BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: DivScale0); |
10815 | SDValue Scale1BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: DivScale1); |
10816 | |
10817 | SDValue NumHi = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: NumBC, N2: Hi); |
10818 | SDValue DenHi = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: DenBC, N2: Hi); |
10819 | |
10820 | SDValue Scale0Hi |
10821 | = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Scale0BC, N2: Hi); |
10822 | SDValue Scale1Hi |
10823 | = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Scale1BC, N2: Hi); |
10824 | |
10825 | SDValue CmpDen = DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: DenHi, RHS: Scale0Hi, Cond: ISD::SETEQ); |
10826 | SDValue CmpNum = DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: NumHi, RHS: Scale1Hi, Cond: ISD::SETEQ); |
10827 | Scale = DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i1, N1: CmpNum, N2: CmpDen); |
10828 | } else { |
10829 | Scale = DivScale1.getValue(R: 1); |
10830 | } |
10831 | |
10832 | SDValue Fmas = DAG.getNode(Opcode: AMDGPUISD::DIV_FMAS, DL: SL, VT: MVT::f64, |
10833 | N1: Fma4, N2: Fma3, N3: Mul, N4: Scale); |
10834 | |
10835 | return DAG.getNode(Opcode: AMDGPUISD::DIV_FIXUP, DL: SL, VT: MVT::f64, N1: Fmas, N2: Y, N3: X); |
10836 | } |
10837 | |
10838 | SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const { |
10839 | EVT VT = Op.getValueType(); |
10840 | |
10841 | if (VT == MVT::f32) |
10842 | return LowerFDIV32(Op, DAG); |
10843 | |
10844 | if (VT == MVT::f64) |
10845 | return LowerFDIV64(Op, DAG); |
10846 | |
10847 | if (VT == MVT::f16) |
10848 | return LowerFDIV16(Op, DAG); |
10849 | |
10850 | llvm_unreachable("Unexpected type for fdiv" ); |
10851 | } |
10852 | |
10853 | SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const { |
10854 | SDLoc dl(Op); |
10855 | SDValue Val = Op.getOperand(i: 0); |
10856 | EVT VT = Val.getValueType(); |
10857 | EVT ResultExpVT = Op->getValueType(ResNo: 1); |
10858 | EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32; |
10859 | |
10860 | SDValue Mant = DAG.getNode( |
10861 | Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT, |
10862 | N1: DAG.getTargetConstant(Val: Intrinsic::amdgcn_frexp_mant, DL: dl, VT: MVT::i32), N2: Val); |
10863 | |
10864 | SDValue Exp = DAG.getNode( |
10865 | Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT: InstrExpVT, |
10866 | N1: DAG.getTargetConstant(Val: Intrinsic::amdgcn_frexp_exp, DL: dl, VT: MVT::i32), N2: Val); |
10867 | |
10868 | if (Subtarget->hasFractBug()) { |
10869 | SDValue Fabs = DAG.getNode(Opcode: ISD::FABS, DL: dl, VT, Operand: Val); |
10870 | SDValue Inf = DAG.getConstantFP( |
10871 | Val: APFloat::getInf(Sem: SelectionDAG::EVTToAPFloatSemantics(VT)), DL: dl, VT); |
10872 | |
10873 | SDValue IsFinite = DAG.getSetCC(DL: dl, VT: MVT::i1, LHS: Fabs, RHS: Inf, Cond: ISD::SETOLT); |
10874 | SDValue Zero = DAG.getConstant(Val: 0, DL: dl, VT: InstrExpVT); |
10875 | Exp = DAG.getNode(Opcode: ISD::SELECT, DL: dl, VT: InstrExpVT, N1: IsFinite, N2: Exp, N3: Zero); |
10876 | Mant = DAG.getNode(Opcode: ISD::SELECT, DL: dl, VT, N1: IsFinite, N2: Mant, N3: Val); |
10877 | } |
10878 | |
10879 | SDValue CastExp = DAG.getSExtOrTrunc(Op: Exp, DL: dl, VT: ResultExpVT); |
10880 | return DAG.getMergeValues(Ops: {Mant, CastExp}, dl); |
10881 | } |
10882 | |
10883 | SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { |
10884 | SDLoc DL(Op); |
10885 | StoreSDNode *Store = cast<StoreSDNode>(Val&: Op); |
10886 | EVT VT = Store->getMemoryVT(); |
10887 | |
10888 | if (VT == MVT::i1) { |
10889 | return DAG.getTruncStore(Chain: Store->getChain(), dl: DL, |
10890 | Val: DAG.getSExtOrTrunc(Op: Store->getValue(), DL, VT: MVT::i32), |
10891 | Ptr: Store->getBasePtr(), SVT: MVT::i1, MMO: Store->getMemOperand()); |
10892 | } |
10893 | |
10894 | assert(VT.isVector() && |
10895 | Store->getValue().getValueType().getScalarType() == MVT::i32); |
10896 | |
10897 | unsigned AS = Store->getAddressSpace(); |
10898 | if (Subtarget->hasLDSMisalignedBug() && |
10899 | AS == AMDGPUAS::FLAT_ADDRESS && |
10900 | Store->getAlign().value() < VT.getStoreSize() && VT.getSizeInBits() > 32) { |
10901 | return SplitVectorStore(Op, DAG); |
10902 | } |
10903 | |
10904 | MachineFunction &MF = DAG.getMachineFunction(); |
10905 | SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
  // If there is a possibility that flat instructions access scratch memory,
  // then we need to use the same legalization rules we use for private
  // accesses.
10908 | if (AS == AMDGPUAS::FLAT_ADDRESS && |
10909 | !Subtarget->hasMultiDwordFlatScratchAddressing()) |
10910 | AS = addressMayBeAccessedAsPrivate(MMO: Store->getMemOperand(), Info: *MFI) ? |
10911 | AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS; |
10912 | |
10913 | unsigned NumElements = VT.getVectorNumElements(); |
10914 | if (AS == AMDGPUAS::GLOBAL_ADDRESS || |
10915 | AS == AMDGPUAS::FLAT_ADDRESS) { |
10916 | if (NumElements > 4) |
10917 | return SplitVectorStore(Op, DAG); |
10918 | // v3 stores not supported on SI. |
10919 | if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores()) |
10920 | return SplitVectorStore(Op, DAG); |
10921 | |
10922 | if (!allowsMemoryAccessForAlignment(Context&: *DAG.getContext(), DL: DAG.getDataLayout(), |
10923 | VT, MMO: *Store->getMemOperand())) |
10924 | return expandUnalignedStore(ST: Store, DAG); |
10925 | |
10926 | return SDValue(); |
10927 | } |
10928 | if (AS == AMDGPUAS::PRIVATE_ADDRESS) { |
10929 | switch (Subtarget->getMaxPrivateElementSize()) { |
10930 | case 4: |
10931 | return scalarizeVectorStore(ST: Store, DAG); |
10932 | case 8: |
10933 | if (NumElements > 2) |
10934 | return SplitVectorStore(Op, DAG); |
10935 | return SDValue(); |
10936 | case 16: |
10937 | if (NumElements > 4 || |
10938 | (NumElements == 3 && !Subtarget->enableFlatScratch())) |
10939 | return SplitVectorStore(Op, DAG); |
10940 | return SDValue(); |
10941 | default: |
10942 | llvm_unreachable("unsupported private_element_size" ); |
10943 | } |
10944 | } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { |
10945 | unsigned Fast = 0; |
10946 | auto Flags = Store->getMemOperand()->getFlags(); |
10947 | if (allowsMisalignedMemoryAccessesImpl(Size: VT.getSizeInBits(), AddrSpace: AS, |
10948 | Alignment: Store->getAlign(), Flags, IsFast: &Fast) && |
10949 | Fast > 1) |
10950 | return SDValue(); |
10951 | |
10952 | if (VT.isVector()) |
10953 | return SplitVectorStore(Op, DAG); |
10954 | |
10955 | return expandUnalignedStore(ST: Store, DAG); |
10956 | } |
10957 | |
  // Probably an invalid store. If so, we'll end up emitting a selection
  // error.
10959 | return SDValue(); |
10960 | } |
10961 | |
10962 | // Avoid the full correct expansion for f32 sqrt when promoting from f16. |
10963 | SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const { |
10964 | SDLoc SL(Op); |
10965 | assert(!Subtarget->has16BitInsts()); |
10966 | SDNodeFlags Flags = Op->getFlags(); |
10967 | SDValue Ext = |
10968 | DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Op.getOperand(i: 0), Flags); |
10969 | |
10970 | SDValue SqrtID = DAG.getTargetConstant(Val: Intrinsic::amdgcn_sqrt, DL: SL, VT: MVT::i32); |
10971 | SDValue Sqrt = |
10972 | DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::f32, N1: SqrtID, N2: Ext, Flags); |
10973 | |
10974 | return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT: MVT::f16, N1: Sqrt, |
10975 | N2: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32), Flags); |
10976 | } |
10977 | |
10978 | SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const { |
10979 | SDLoc DL(Op); |
10980 | SDNodeFlags Flags = Op->getFlags(); |
10981 | MVT VT = Op.getValueType().getSimpleVT(); |
10982 | const SDValue X = Op.getOperand(i: 0); |
10983 | |
10984 | if (allowApproxFunc(DAG, Flags)) { |
    // The instruction is accurate to 1 ulp but ignores denormals.
10986 | return DAG.getNode( |
10987 | Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT, |
10988 | N1: DAG.getTargetConstant(Val: Intrinsic::amdgcn_sqrt, DL, VT: MVT::i32), N2: X, Flags); |
10989 | } |
10990 | |
10991 | SDValue ScaleThreshold = DAG.getConstantFP(Val: 0x1.0p-96f, DL, VT); |
10992 | SDValue NeedScale = DAG.getSetCC(DL, VT: MVT::i1, LHS: X, RHS: ScaleThreshold, Cond: ISD::SETOLT); |
10993 | |
10994 | SDValue ScaleUpFactor = DAG.getConstantFP(Val: 0x1.0p+32f, DL, VT); |
10995 | |
10996 | SDValue ScaledX = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: X, N2: ScaleUpFactor, Flags); |
10997 | |
10998 | SDValue SqrtX = |
10999 | DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: NeedScale, N2: ScaledX, N3: X, Flags); |
11000 | |
11001 | SDValue SqrtS; |
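  // When denormals have to be handled, start from the hardware sqrt and
  // correct it by at most one ulp: compute the FMA residuals for the one-ulp
  // neighbors below and above the initial result and step down or up when the
  // residual's sign indicates the neighbor is the better answer. Otherwise
  // use a cheaper rsq-based refinement.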
11002 | if (needsDenormHandlingF32(DAG, Src: X, Flags)) { |
11003 | SDValue SqrtID = |
11004 | DAG.getTargetConstant(Val: Intrinsic::amdgcn_sqrt, DL, VT: MVT::i32); |
11005 | SqrtS = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT, N1: SqrtID, N2: SqrtX, Flags); |
11006 | |
11007 | SDValue SqrtSAsInt = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i32, Operand: SqrtS); |
11008 | SDValue SqrtSNextDownInt = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, N1: SqrtSAsInt, |
11009 | N2: DAG.getConstant(Val: -1, DL, VT: MVT::i32)); |
11010 | SDValue SqrtSNextDown = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: SqrtSNextDownInt); |
11011 | |
11012 | SDValue NegSqrtSNextDown = |
11013 | DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: SqrtSNextDown, Flags); |
11014 | |
11015 | SDValue SqrtVP = |
11016 | DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: NegSqrtSNextDown, N2: SqrtS, N3: SqrtX, Flags); |
11017 | |
11018 | SDValue SqrtSNextUpInt = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, N1: SqrtSAsInt, |
11019 | N2: DAG.getConstant(Val: 1, DL, VT: MVT::i32)); |
11020 | SDValue SqrtSNextUp = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: SqrtSNextUpInt); |
11021 | |
11022 | SDValue NegSqrtSNextUp = DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: SqrtSNextUp, Flags); |
11023 | SDValue SqrtVS = |
11024 | DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: NegSqrtSNextUp, N2: SqrtS, N3: SqrtX, Flags); |
11025 | |
11026 | SDValue Zero = DAG.getConstantFP(Val: 0.0f, DL, VT); |
11027 | SDValue SqrtVPLE0 = DAG.getSetCC(DL, VT: MVT::i1, LHS: SqrtVP, RHS: Zero, Cond: ISD::SETOLE); |
11028 | |
11029 | SqrtS = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: SqrtVPLE0, N2: SqrtSNextDown, N3: SqrtS, |
11030 | Flags); |
11031 | |
11032 | SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, VT: MVT::i1, LHS: SqrtVS, RHS: Zero, Cond: ISD::SETOGT); |
11033 | SqrtS = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: SqrtVPVSGT0, N2: SqrtSNextUp, N3: SqrtS, |
11034 | Flags); |
11035 | } else { |
11036 | SDValue SqrtR = DAG.getNode(Opcode: AMDGPUISD::RSQ, DL, VT, Operand: SqrtX, Flags); |
11037 | |
11038 | SqrtS = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: SqrtX, N2: SqrtR, Flags); |
11039 | |
11040 | SDValue Half = DAG.getConstantFP(Val: 0.5f, DL, VT); |
11041 | SDValue SqrtH = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: SqrtR, N2: Half, Flags); |
11042 | SDValue NegSqrtH = DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: SqrtH, Flags); |
11043 | |
11044 | SDValue SqrtE = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: NegSqrtH, N2: SqrtS, N3: Half, Flags); |
11045 | SqrtH = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: SqrtH, N2: SqrtE, N3: SqrtH, Flags); |
11046 | SqrtS = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: SqrtS, N2: SqrtE, N3: SqrtS, Flags); |
11047 | |
11048 | SDValue NegSqrtS = DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: SqrtS, Flags); |
11049 | SDValue SqrtD = |
11050 | DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: NegSqrtS, N2: SqrtS, N3: SqrtX, Flags); |
11051 | SqrtS = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: SqrtD, N2: SqrtH, N3: SqrtS, Flags); |
11052 | } |
11053 | |
11054 | SDValue ScaleDownFactor = DAG.getConstantFP(Val: 0x1.0p-16f, DL, VT); |
11055 | |
11056 | SDValue ScaledDown = |
11057 | DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: SqrtS, N2: ScaleDownFactor, Flags); |
11058 | |
11059 | SqrtS = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: NeedScale, N2: ScaledDown, N3: SqrtS, Flags); |
11060 | SDValue IsZeroOrInf = |
11061 | DAG.getNode(Opcode: ISD::IS_FPCLASS, DL, VT: MVT::i1, N1: SqrtX, |
11062 | N2: DAG.getTargetConstant(Val: fcZero | fcPosInf, DL, VT: MVT::i32)); |
11063 | |
11064 | return DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: IsZeroOrInf, N2: SqrtX, N3: SqrtS, Flags); |
11065 | } |
11066 | |
11067 | SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const { |
  // For the double type, the SQRT and RSQ instructions don't have the
  // required precision, so we apply Goldschmidt's algorithm to improve the
  // result:
11070 | // |
11071 | // y0 = rsq(x) |
11072 | // g0 = x * y0 |
11073 | // h0 = 0.5 * y0 |
11074 | // |
11075 | // r0 = 0.5 - h0 * g0 |
11076 | // g1 = g0 * r0 + g0 |
11077 | // h1 = h0 * r0 + h0 |
11078 | // |
11079 | // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1 |
11080 | // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1 |
11081 | // h2 = h1 * r1 + h1 |
11082 | // |
11083 | // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2 |
11084 | // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2 |
11085 | // |
11086 | // sqrt(x) = g3 |
11087 | |
11088 | SDNodeFlags Flags = Op->getFlags(); |
11089 | |
11090 | SDLoc DL(Op); |
11091 | |
11092 | SDValue X = Op.getOperand(i: 0); |
11093 | SDValue ScaleConstant = DAG.getConstantFP(Val: 0x1.0p-767, DL, VT: MVT::f64); |
11094 | |
11095 | SDValue Scaling = DAG.getSetCC(DL, VT: MVT::i1, LHS: X, RHS: ScaleConstant, Cond: ISD::SETOLT); |
11096 | |
11097 | SDValue ZeroInt = DAG.getConstant(Val: 0, DL, VT: MVT::i32); |
11098 | |
11099 | // Scale up input if it is too small. |
11100 | SDValue ScaleUpFactor = DAG.getConstant(Val: 256, DL, VT: MVT::i32); |
11101 | SDValue ScaleUp = |
11102 | DAG.getNode(Opcode: ISD::SELECT, DL, VT: MVT::i32, N1: Scaling, N2: ScaleUpFactor, N3: ZeroInt); |
11103 | SDValue SqrtX = DAG.getNode(Opcode: ISD::FLDEXP, DL, VT: MVT::f64, N1: X, N2: ScaleUp, Flags); |
11104 | |
11105 | SDValue SqrtY = DAG.getNode(Opcode: AMDGPUISD::RSQ, DL, VT: MVT::f64, Operand: SqrtX); |
11106 | |
11107 | SDValue SqrtS0 = DAG.getNode(Opcode: ISD::FMUL, DL, VT: MVT::f64, N1: SqrtX, N2: SqrtY); |
11108 | |
11109 | SDValue Half = DAG.getConstantFP(Val: 0.5, DL, VT: MVT::f64); |
11110 | SDValue SqrtH0 = DAG.getNode(Opcode: ISD::FMUL, DL, VT: MVT::f64, N1: SqrtY, N2: Half); |
11111 | |
11112 | SDValue NegSqrtH0 = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f64, Operand: SqrtH0); |
11113 | SDValue SqrtR0 = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: NegSqrtH0, N2: SqrtS0, N3: Half); |
11114 | |
11115 | SDValue SqrtH1 = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: SqrtH0, N2: SqrtR0, N3: SqrtH0); |
11116 | |
11117 | SDValue SqrtS1 = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: SqrtS0, N2: SqrtR0, N3: SqrtS0); |
11118 | |
11119 | SDValue NegSqrtS1 = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f64, Operand: SqrtS1); |
11120 | SDValue SqrtD0 = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: NegSqrtS1, N2: SqrtS1, N3: SqrtX); |
11121 | |
11122 | SDValue SqrtS2 = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: SqrtD0, N2: SqrtH1, N3: SqrtS1); |
11123 | |
11124 | SDValue NegSqrtS2 = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f64, Operand: SqrtS2); |
11125 | SDValue SqrtD1 = |
11126 | DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: NegSqrtS2, N2: SqrtS2, N3: SqrtX); |
11127 | |
11128 | SDValue SqrtRet = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: SqrtD1, N2: SqrtH1, N3: SqrtS2); |
11129 | |
11130 | SDValue ScaleDownFactor = DAG.getConstant(Val: -128, DL, VT: MVT::i32); |
11131 | SDValue ScaleDown = |
11132 | DAG.getNode(Opcode: ISD::SELECT, DL, VT: MVT::i32, N1: Scaling, N2: ScaleDownFactor, N3: ZeroInt); |
11133 | SqrtRet = DAG.getNode(Opcode: ISD::FLDEXP, DL, VT: MVT::f64, N1: SqrtRet, N2: ScaleDown, Flags); |
11134 | |
11135 | // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check |
11136 | // with finite only or nsz because rsq(+/-0) = +/-inf |
11137 | |
11138 | // TODO: Check for DAZ and expand to subnormals |
11139 | SDValue IsZeroOrInf = |
11140 | DAG.getNode(Opcode: ISD::IS_FPCLASS, DL, VT: MVT::i1, N1: SqrtX, |
11141 | N2: DAG.getTargetConstant(Val: fcZero | fcPosInf, DL, VT: MVT::i32)); |
11142 | |
11143 | // If x is +INF, +0, or -0, use its original value |
11144 | return DAG.getNode(Opcode: ISD::SELECT, DL, VT: MVT::f64, N1: IsZeroOrInf, N2: SqrtX, N3: SqrtRet, |
11145 | Flags); |
11146 | } |
11147 | |
11148 | SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { |
11149 | SDLoc DL(Op); |
11150 | EVT VT = Op.getValueType(); |
11151 | SDValue Arg = Op.getOperand(i: 0); |
11152 | SDValue TrigVal; |
11153 | |
11154 | // Propagate fast-math flags so that the multiply we introduce can be folded |
11155 | // if Arg is already the result of a multiply by constant. |
11156 | auto Flags = Op->getFlags(); |
11157 | |
11158 | SDValue OneOver2Pi = DAG.getConstantFP(Val: 0.5 * numbers::inv_pi, DL, VT); |
11159 | |
11160 | if (Subtarget->hasTrigReducedRange()) { |
11161 | SDValue MulVal = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Arg, N2: OneOver2Pi, Flags); |
11162 | TrigVal = DAG.getNode(Opcode: AMDGPUISD::FRACT, DL, VT, Operand: MulVal, Flags); |
11163 | } else { |
11164 | TrigVal = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Arg, N2: OneOver2Pi, Flags); |
11165 | } |
11166 | |
11167 | switch (Op.getOpcode()) { |
11168 | case ISD::FCOS: |
11169 | return DAG.getNode(Opcode: AMDGPUISD::COS_HW, DL: SDLoc(Op), VT, Operand: TrigVal, Flags); |
11170 | case ISD::FSIN: |
11171 | return DAG.getNode(Opcode: AMDGPUISD::SIN_HW, DL: SDLoc(Op), VT, Operand: TrigVal, Flags); |
11172 | default: |
11173 | llvm_unreachable("Wrong trig opcode" ); |
11174 | } |
11175 | } |
11176 | |
11177 | SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const { |
11178 | AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Val&: Op); |
11179 | assert(AtomicNode->isCompareAndSwap()); |
11180 | unsigned AS = AtomicNode->getAddressSpace(); |
11181 | |
11182 | // No custom lowering required for local address space |
11183 | if (!AMDGPU::isFlatGlobalAddrSpace(AS)) |
11184 | return Op; |
11185 | |
11186 | // Non-local address space requires custom lowering for atomic compare |
11187 | // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2 |
11188 | SDLoc DL(Op); |
11189 | SDValue ChainIn = Op.getOperand(i: 0); |
11190 | SDValue Addr = Op.getOperand(i: 1); |
11191 | SDValue Old = Op.getOperand(i: 2); |
11192 | SDValue New = Op.getOperand(i: 3); |
11193 | EVT VT = Op.getValueType(); |
11194 | MVT SimpleVT = VT.getSimpleVT(); |
11195 | MVT VecType = MVT::getVectorVT(VT: SimpleVT, NumElements: 2); |
11196 | |
11197 | SDValue NewOld = DAG.getBuildVector(VT: VecType, DL, Ops: {New, Old}); |
11198 | SDValue Ops[] = { ChainIn, Addr, NewOld }; |
11199 | |
11200 | return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::ATOMIC_CMP_SWAP, dl: DL, VTList: Op->getVTList(), |
11201 | Ops, MemVT: VT, MMO: AtomicNode->getMemOperand()); |
11202 | } |
11203 | |
11204 | //===----------------------------------------------------------------------===// |
11205 | // Custom DAG optimizations |
11206 | //===----------------------------------------------------------------------===// |
11207 | |
11208 | SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, |
11209 | DAGCombinerInfo &DCI) const { |
11210 | EVT VT = N->getValueType(ResNo: 0); |
11211 | EVT ScalarVT = VT.getScalarType(); |
11212 | if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16) |
11213 | return SDValue(); |
11214 | |
11215 | SelectionDAG &DAG = DCI.DAG; |
11216 | SDLoc DL(N); |
11217 | |
11218 | SDValue Src = N->getOperand(Num: 0); |
11219 | EVT SrcVT = Src.getValueType(); |
11220 | |
11221 | // TODO: We could try to match extracting the higher bytes, which would be |
11222 | // easier if i8 vectors weren't promoted to i32 vectors, particularly after |
11223 | // types are legalized. v4i8 -> v4f32 is probably the only case to worry |
11224 | // about in practice. |
11225 | if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) { |
11226 | if (DAG.MaskedValueIsZero(Op: Src, Mask: APInt::getHighBitsSet(numBits: 32, hiBitsSet: 24))) { |
11227 | SDValue Cvt = DAG.getNode(Opcode: AMDGPUISD::CVT_F32_UBYTE0, DL, VT: MVT::f32, Operand: Src); |
11228 | DCI.AddToWorklist(N: Cvt.getNode()); |
11229 | |
11230 | // For the f16 case, fold to a cast to f32 and then cast back to f16. |
11231 | if (ScalarVT != MVT::f32) { |
11232 | Cvt = DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT, N1: Cvt, |
11233 | N2: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32)); |
11234 | } |
11235 | return Cvt; |
11236 | } |
11237 | } |
11238 | |
11239 | return SDValue(); |
11240 | } |
11241 | |
11242 | SDValue SITargetLowering::performFCopySignCombine(SDNode *N, |
11243 | DAGCombinerInfo &DCI) const { |
11244 | SDValue MagnitudeOp = N->getOperand(Num: 0); |
11245 | SDValue SignOp = N->getOperand(Num: 1); |
11246 | SelectionDAG &DAG = DCI.DAG; |
11247 | SDLoc DL(N); |
11248 | |
11249 | // f64 fcopysign is really an f32 copysign on the high bits, so replace the |
11250 | // lower half with a copy. |
11251 | // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y) |
11252 | if (MagnitudeOp.getValueType() == MVT::f64) { |
11253 | SDValue MagAsVector = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v2f32, Operand: MagnitudeOp); |
11254 | SDValue MagLo = |
11255 | DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::f32, N1: MagAsVector, |
11256 | N2: DAG.getConstant(Val: 0, DL, VT: MVT::i32)); |
11257 | SDValue MagHi = |
11258 | DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::f32, N1: MagAsVector, |
11259 | N2: DAG.getConstant(Val: 1, DL, VT: MVT::i32)); |
11260 | |
11261 | SDValue HiOp = |
11262 | DAG.getNode(Opcode: ISD::FCOPYSIGN, DL, VT: MVT::f32, N1: MagHi, N2: SignOp); |
11263 | |
11264 | SDValue Vector = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL, VT: MVT::v2f32, N1: MagLo, N2: HiOp); |
11265 | |
11266 | return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f64, Operand: Vector); |
11267 | } |
11268 | |
11269 | if (SignOp.getValueType() != MVT::f64) |
11270 | return SDValue(); |
11271 | |
// Reduce the width of the sign operand; we only need the highest bit.
11273 | // |
11274 | // fcopysign f64:x, f64:y -> |
11275 | // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1) |
11276 | // TODO: In some cases it might make sense to go all the way to f16. |
11277 | SDValue SignAsVector = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v2f32, Operand: SignOp); |
11278 | SDValue SignAsF32 = |
11279 | DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::f32, N1: SignAsVector, |
11280 | N2: DAG.getConstant(Val: 1, DL, VT: MVT::i32)); |
11281 | |
11282 | return DAG.getNode(Opcode: ISD::FCOPYSIGN, DL, VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 0), |
11283 | N2: SignAsF32); |
11284 | } |
11285 | |
11286 | // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2) |
11287 | // (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no |
11288 | // bits |
11289 | |
11290 | // This is a variant of |
11291 | // (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2), |
11292 | // |
11293 | // The normal DAG combiner will do this, but only if the add has one use since |
11294 | // that would increase the number of instructions. |
11295 | // |
11296 | // This prevents us from seeing a constant offset that can be folded into a |
11297 | // memory instruction's addressing mode. If we know the resulting add offset of |
11298 | // a pointer can be folded into an addressing offset, we can replace the pointer |
11299 | // operand with the add of new constant offset. This eliminates one of the uses, |
11300 | // and may allow the remaining use to also be simplified. |
11301 | // |
11302 | SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, |
11303 | unsigned AddrSpace, |
11304 | EVT MemVT, |
11305 | DAGCombinerInfo &DCI) const { |
11306 | SDValue N0 = N->getOperand(Num: 0); |
11307 | SDValue N1 = N->getOperand(Num: 1); |
11308 | |
11309 | // We only do this to handle cases where it's profitable when there are |
11310 | // multiple uses of the add, so defer to the standard combine. |
11311 | if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) || |
11312 | N0->hasOneUse()) |
11313 | return SDValue(); |
11314 | |
11315 | const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(Val&: N1); |
11316 | if (!CN1) |
11317 | return SDValue(); |
11318 | |
11319 | const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(Val: N0.getOperand(i: 1)); |
11320 | if (!CAdd) |
11321 | return SDValue(); |
11322 | |
11323 | SelectionDAG &DAG = DCI.DAG; |
11324 | |
11325 | if (N0->getOpcode() == ISD::OR && |
11326 | !DAG.haveNoCommonBitsSet(A: N0.getOperand(i: 0), B: N0.getOperand(i: 1))) |
11327 | return SDValue(); |
11328 | |
11329 | // If the resulting offset is too large, we can't fold it into the |
11330 | // addressing mode offset. |
11331 | APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue(); |
11332 | Type *Ty = MemVT.getTypeForEVT(Context&: *DCI.DAG.getContext()); |
11333 | |
11334 | AddrMode AM; |
11335 | AM.HasBaseReg = true; |
11336 | AM.BaseOffs = Offset.getSExtValue(); |
11337 | if (!isLegalAddressingMode(DL: DCI.DAG.getDataLayout(), AM, Ty, AS: AddrSpace)) |
11338 | return SDValue(); |
11339 | |
11340 | SDLoc SL(N); |
11341 | EVT VT = N->getValueType(ResNo: 0); |
11342 | |
11343 | SDValue ShlX = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT, N1: N0.getOperand(i: 0), N2: N1); |
11344 | SDValue COffset = DAG.getConstant(Val: Offset, DL: SL, VT); |
11345 | |
11346 | SDNodeFlags Flags; |
11347 | Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() && |
11348 | (N0.getOpcode() == ISD::OR || |
11349 | N0->getFlags().hasNoUnsignedWrap())); |
11350 | |
11351 | return DAG.getNode(Opcode: ISD::ADD, DL: SL, VT, N1: ShlX, N2: COffset, Flags); |
11352 | } |
11353 | |
/// MemSDNode::getBasePtr() does not work for intrinsics, whose pointer operand
/// index needs to be offset by the chain and intrinsic ID. Theoretically we
/// would also need to check the specific intrinsic, but they all place the
/// pointer operand first.
11357 | static unsigned getBasePtrIndex(const MemSDNode *N) { |
11358 | switch (N->getOpcode()) { |
11359 | case ISD::STORE: |
11360 | case ISD::INTRINSIC_W_CHAIN: |
11361 | case ISD::INTRINSIC_VOID: |
11362 | return 2; |
11363 | default: |
11364 | return 1; |
11365 | } |
11366 | } |
11367 | |
11368 | SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N, |
11369 | DAGCombinerInfo &DCI) const { |
11370 | SelectionDAG &DAG = DCI.DAG; |
11371 | SDLoc SL(N); |
11372 | |
11373 | unsigned PtrIdx = getBasePtrIndex(N); |
11374 | SDValue Ptr = N->getOperand(Num: PtrIdx); |
11375 | |
11376 | // TODO: We could also do this for multiplies. |
11377 | if (Ptr.getOpcode() == ISD::SHL) { |
11378 | SDValue NewPtr = performSHLPtrCombine(N: Ptr.getNode(), AddrSpace: N->getAddressSpace(), |
11379 | MemVT: N->getMemoryVT(), DCI); |
11380 | if (NewPtr) { |
11381 | SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end()); |
11382 | |
11383 | NewOps[PtrIdx] = NewPtr; |
11384 | return SDValue(DAG.UpdateNodeOperands(N, Ops: NewOps), 0); |
11385 | } |
11386 | } |
11387 | |
11388 | return SDValue(); |
11389 | } |
11390 | |
11391 | static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) { |
11392 | return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) || |
11393 | (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) || |
11394 | (Opc == ISD::XOR && Val == 0); |
11395 | } |
11396 | |
// Break up a 64-bit bitwise operation with a constant into two 32-bit and/or/xor
// operations. This will typically happen anyway for a VALU 64-bit and. This
// exposes other 32-bit integer combine opportunities since most 64-bit
// operations are decomposed this way. TODO: We won't want this for SALU,
// especially if the constant is an inline immediate.
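// For example, (and i64:x, 0x00000000ffffffff) is split so that the high
// 32-bit half folds to zero and the low half is simply a copy of lo_32(x).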
11402 | SDValue SITargetLowering::splitBinaryBitConstantOp( |
11403 | DAGCombinerInfo &DCI, |
11404 | const SDLoc &SL, |
11405 | unsigned Opc, SDValue LHS, |
11406 | const ConstantSDNode *CRHS) const { |
11407 | uint64_t Val = CRHS->getZExtValue(); |
11408 | uint32_t ValLo = Lo_32(Value: Val); |
11409 | uint32_t ValHi = Hi_32(Value: Val); |
11410 | const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); |
11411 | |
11412 | if ((bitOpWithConstantIsReducible(Opc, Val: ValLo) || |
11413 | bitOpWithConstantIsReducible(Opc, Val: ValHi)) || |
11414 | (CRHS->hasOneUse() && !TII->isInlineConstant(Imm: CRHS->getAPIntValue()))) { |
11415 | // If we need to materialize a 64-bit immediate, it will be split up later |
11416 | // anyway. Avoid creating the harder to understand 64-bit immediate |
11417 | // materialization. |
11418 | return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi); |
11419 | } |
11420 | |
11421 | return SDValue(); |
11422 | } |
11423 | |
11424 | bool llvm::isBoolSGPR(SDValue V) { |
11425 | if (V.getValueType() != MVT::i1) |
11426 | return false; |
11427 | switch (V.getOpcode()) { |
11428 | default: |
11429 | break; |
11430 | case ISD::SETCC: |
11431 | case AMDGPUISD::FP_CLASS: |
11432 | return true; |
11433 | case ISD::AND: |
11434 | case ISD::OR: |
11435 | case ISD::XOR: |
11436 | return isBoolSGPR(V: V.getOperand(i: 0)) && isBoolSGPR(V: V.getOperand(i: 1)); |
11437 | } |
11438 | return false; |
11439 | } |
11440 | |
11441 | // If a constant has all zeroes or all ones within each byte return it. |
11442 | // Otherwise return 0. |
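// E.g. getConstantPermuteMask(0x00ff0000) returns 0x00ff0000, while
// 0x00f00000 returns 0 because byte 2 is only partially selected.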
11443 | static uint32_t getConstantPermuteMask(uint32_t C) { |
11444 | // 0xff for any zero byte in the mask |
11445 | uint32_t ZeroByteMask = 0; |
11446 | if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff; |
11447 | if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00; |
11448 | if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000; |
11449 | if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000; |
11450 | uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte |
11451 | if ((NonZeroByteMask & C) != NonZeroByteMask) |
11452 | return 0; // Partial bytes selected. |
11453 | return C; |
11454 | } |
11455 | |
11456 | // Check if a node selects whole bytes from its operand 0 starting at a byte |
// boundary while masking the rest. Returns the select mask as used by
// v_perm_b32, or ~0 (i.e. -1) if it does not match.
11459 | // Note byte select encoding: |
11460 | // value 0-3 selects corresponding source byte; |
11461 | // value 0xc selects zero; |
11462 | // value 0xff selects 0xff. |
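// E.g. (and x, 0x0000ffff) gives the mask 0x0c0c0100 (low two bytes of the
// source, zero in the high two bytes), and (srl x, 16) gives 0x0c0c0302.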
11463 | static uint32_t getPermuteMask(SDValue V) { |
11464 | assert(V.getValueSizeInBits() == 32); |
11465 | |
11466 | if (V.getNumOperands() != 2) |
11467 | return ~0; |
11468 | |
11469 | ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(Val: V.getOperand(i: 1)); |
11470 | if (!N1) |
11471 | return ~0; |
11472 | |
11473 | uint32_t C = N1->getZExtValue(); |
11474 | |
11475 | switch (V.getOpcode()) { |
11476 | default: |
11477 | break; |
11478 | case ISD::AND: |
11479 | if (uint32_t ConstMask = getConstantPermuteMask(C)) |
11480 | return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask); |
11481 | break; |
11482 | |
11483 | case ISD::OR: |
11484 | if (uint32_t ConstMask = getConstantPermuteMask(C)) |
11485 | return (0x03020100 & ~ConstMask) | ConstMask; |
11486 | break; |
11487 | |
11488 | case ISD::SHL: |
11489 | if (C % 8) |
11490 | return ~0; |
11491 | |
11492 | return uint32_t((0x030201000c0c0c0cull << C) >> 32); |
11493 | |
11494 | case ISD::SRL: |
11495 | if (C % 8) |
11496 | return ~0; |
11497 | |
11498 | return uint32_t(0x0c0c0c0c03020100ull >> C); |
11499 | } |
11500 | |
11501 | return ~0; |
11502 | } |
11503 | |
11504 | SDValue SITargetLowering::performAndCombine(SDNode *N, |
11505 | DAGCombinerInfo &DCI) const { |
11506 | if (DCI.isBeforeLegalize()) |
11507 | return SDValue(); |
11508 | |
11509 | SelectionDAG &DAG = DCI.DAG; |
11510 | EVT VT = N->getValueType(ResNo: 0); |
11511 | SDValue LHS = N->getOperand(Num: 0); |
11512 | SDValue RHS = N->getOperand(Num: 1); |
11513 | |
11514 | |
11515 | const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS); |
11516 | if (VT == MVT::i64 && CRHS) { |
11517 | if (SDValue Split |
11518 | = splitBinaryBitConstantOp(DCI, SL: SDLoc(N), Opc: ISD::AND, LHS, CRHS)) |
11519 | return Split; |
11520 | } |
11521 | |
11522 | if (CRHS && VT == MVT::i32) { |
11523 | // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb |
11524 | // nb = number of trailing zeroes in mask |
11525 | // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass, |
11526 | // given that we are selecting 8 or 16 bit fields starting at byte boundary. |
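// E.g. (and (srl x, 8), 0xff00) -> (shl (bfe x, 16, 8), 8).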
11527 | uint64_t Mask = CRHS->getZExtValue(); |
11528 | unsigned Bits = llvm::popcount(Value: Mask); |
11529 | if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL && |
11530 | (Bits == 8 || Bits == 16) && isShiftedMask_64(Value: Mask) && !(Mask & 1)) { |
11531 | if (auto *CShift = dyn_cast<ConstantSDNode>(Val: LHS->getOperand(Num: 1))) { |
11532 | unsigned Shift = CShift->getZExtValue(); |
11533 | unsigned NB = CRHS->getAPIntValue().countr_zero(); |
11534 | unsigned Offset = NB + Shift; |
11535 | if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary. |
11536 | SDLoc SL(N); |
11537 | SDValue BFE = DAG.getNode(Opcode: AMDGPUISD::BFE_U32, DL: SL, VT: MVT::i32, |
11538 | N1: LHS->getOperand(Num: 0), |
11539 | N2: DAG.getConstant(Val: Offset, DL: SL, VT: MVT::i32), |
11540 | N3: DAG.getConstant(Val: Bits, DL: SL, VT: MVT::i32)); |
11541 | EVT NarrowVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: Bits); |
11542 | SDValue Ext = DAG.getNode(Opcode: ISD::AssertZext, DL: SL, VT, N1: BFE, |
11543 | N2: DAG.getValueType(NarrowVT)); |
11544 | SDValue Shl = DAG.getNode(Opcode: ISD::SHL, DL: SDLoc(LHS), VT, N1: Ext, |
11545 | N2: DAG.getConstant(Val: NB, DL: SDLoc(CRHS), VT: MVT::i32)); |
11546 | return Shl; |
11547 | } |
11548 | } |
11549 | } |
11550 | |
11551 | // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2) |
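// E.g. and (perm x, y, 0x03020504), 0x0000ffff -> perm x, y, 0x0c0c0504;
// the bytes cleared by the and become 0x0c (select zero) entries in the mask.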
11552 | if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM && |
11553 | isa<ConstantSDNode>(Val: LHS.getOperand(i: 2))) { |
11554 | uint32_t Sel = getConstantPermuteMask(C: Mask); |
11555 | if (!Sel) |
11556 | return SDValue(); |
11557 | |
11558 | // Select 0xc for all zero bytes |
11559 | Sel = (LHS.getConstantOperandVal(i: 2) & Sel) | (~Sel & 0x0c0c0c0c); |
11560 | SDLoc DL(N); |
11561 | return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: LHS.getOperand(i: 0), |
11562 | N2: LHS.getOperand(i: 1), N3: DAG.getConstant(Val: Sel, DL, VT: MVT::i32)); |
11563 | } |
11564 | } |
11565 | |
11566 | // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) -> |
11567 | // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity) |
11568 | if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) { |
11569 | ISD::CondCode LCC = cast<CondCodeSDNode>(Val: LHS.getOperand(i: 2))->get(); |
11570 | ISD::CondCode RCC = cast<CondCodeSDNode>(Val: RHS.getOperand(i: 2))->get(); |
11571 | |
11572 | SDValue X = LHS.getOperand(i: 0); |
11573 | SDValue Y = RHS.getOperand(i: 0); |
11574 | if (Y.getOpcode() != ISD::FABS || Y.getOperand(i: 0) != X || |
11575 | !isTypeLegal(VT: X.getValueType())) |
11576 | return SDValue(); |
11577 | |
11578 | if (LCC == ISD::SETO) { |
11579 | if (X != LHS.getOperand(i: 1)) |
11580 | return SDValue(); |
11581 | |
11582 | if (RCC == ISD::SETUNE) { |
11583 | const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(Val: RHS.getOperand(i: 1)); |
11584 | if (!C1 || !C1->isInfinity() || C1->isNegative()) |
11585 | return SDValue(); |
11586 | |
11587 | const uint32_t Mask = SIInstrFlags::N_NORMAL | |
11588 | SIInstrFlags::N_SUBNORMAL | |
11589 | SIInstrFlags::N_ZERO | |
11590 | SIInstrFlags::P_ZERO | |
11591 | SIInstrFlags::P_SUBNORMAL | |
11592 | SIInstrFlags::P_NORMAL; |
11593 | |
11594 | static_assert(((~(SIInstrFlags::S_NAN | |
11595 | SIInstrFlags::Q_NAN | |
11596 | SIInstrFlags::N_INFINITY | |
11597 | SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask, |
11598 | "mask not equal" ); |
11599 | |
11600 | SDLoc DL(N); |
11601 | return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL, VT: MVT::i1, |
11602 | N1: X, N2: DAG.getConstant(Val: Mask, DL, VT: MVT::i32)); |
11603 | } |
11604 | } |
11605 | } |
11606 | |
11607 | if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS) |
11608 | std::swap(a&: LHS, b&: RHS); |
11609 | |
11610 | if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS && |
11611 | RHS.hasOneUse()) { |
11612 | ISD::CondCode LCC = cast<CondCodeSDNode>(Val: LHS.getOperand(i: 2))->get(); |
11613 | // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan) |
11614 | // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan) |
11615 | const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Val: RHS.getOperand(i: 1)); |
11616 | if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask && |
11617 | (RHS.getOperand(i: 0) == LHS.getOperand(i: 0) && |
11618 | LHS.getOperand(i: 0) == LHS.getOperand(i: 1))) { |
11619 | const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN; |
11620 | unsigned NewMask = LCC == ISD::SETO ? |
11621 | Mask->getZExtValue() & ~OrdMask : |
11622 | Mask->getZExtValue() & OrdMask; |
11623 | |
11624 | SDLoc DL(N); |
11625 | return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL, VT: MVT::i1, N1: RHS.getOperand(i: 0), |
11626 | N2: DAG.getConstant(Val: NewMask, DL, VT: MVT::i32)); |
11627 | } |
11628 | } |
11629 | |
11630 | if (VT == MVT::i32 && |
11631 | (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) { |
11632 | // and x, (sext cc from i1) => select cc, x, 0 |
11633 | if (RHS.getOpcode() != ISD::SIGN_EXTEND) |
11634 | std::swap(a&: LHS, b&: RHS); |
11635 | if (isBoolSGPR(V: RHS.getOperand(i: 0))) |
11636 | return DAG.getSelect(DL: SDLoc(N), VT: MVT::i32, Cond: RHS.getOperand(i: 0), |
11637 | LHS, RHS: DAG.getConstant(Val: 0, DL: SDLoc(N), VT: MVT::i32)); |
11638 | } |
11639 | |
11640 | // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2) |
11641 | const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); |
11642 | if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() && |
11643 | N->isDivergent() && TII->pseudoToMCOpcode(Opcode: AMDGPU::V_PERM_B32_e64) != -1) { |
11644 | uint32_t LHSMask = getPermuteMask(V: LHS); |
11645 | uint32_t RHSMask = getPermuteMask(V: RHS); |
11646 | if (LHSMask != ~0u && RHSMask != ~0u) { |
11647 | // Canonicalize the expression in an attempt to have fewer unique masks |
11648 | // and therefore fewer registers used to hold the masks. |
11649 | if (LHSMask > RHSMask) { |
11650 | std::swap(a&: LHSMask, b&: RHSMask); |
11651 | std::swap(a&: LHS, b&: RHS); |
11652 | } |
11653 | |
// Select 0xc for each lane used from the source operand. Zero has the 0xc
// mask set, 0xff has 0xff in the mask, and actual lanes are in the 0-3 range.
11656 | uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c; |
11657 | uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c; |
11658 | |
// Check if we need to combine values from two sources within a byte.
11660 | if (!(LHSUsedLanes & RHSUsedLanes) && |
// If we select the high and low words, keep it for SDWA.
11662 | // TODO: teach SDWA to work with v_perm_b32 and remove the check. |
11663 | !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) { |
// Each byte in each mask is either a selector value 0-3, or has higher bits
// set in either of the masks (0xff selects the constant 0xff, 0x0c selects
// zero). If 0x0c appears in either mask for a byte, the result must be 0x0c.
// Otherwise the mask which is not 0xff wins. ANDing both masks gives the
// correct result, except that 0x0c still has to be corrected to exactly 0x0c.
11669 | uint32_t Mask = LHSMask & RHSMask; |
11670 | for (unsigned I = 0; I < 32; I += 8) { |
11671 | uint32_t ByteSel = 0xff << I; |
11672 | if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c) |
11673 | Mask &= (0x0c << I) & 0xffffffff; |
11674 | } |
11675 | |
11676 | // Add 4 to each active LHS lane. It will not affect any existing 0xff |
11677 | // or 0x0c. |
11678 | uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404); |
11679 | SDLoc DL(N); |
11680 | |
11681 | return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, |
11682 | N1: LHS.getOperand(i: 0), N2: RHS.getOperand(i: 0), |
11683 | N3: DAG.getConstant(Val: Sel, DL, VT: MVT::i32)); |
11684 | } |
11685 | } |
11686 | } |
11687 | |
11688 | return SDValue(); |
11689 | } |
11690 | |
// A key component of v_perm is a mapping between the byte positions of the src
// operands and the byte positions of the dest. To provide such a mapping, we
// need: 1. the node that provides byte x of the dest of the OR, and 2. the byte
// of that node used to provide byte x. calculateByteProvider finds which node
// provides a certain byte of the dest of the OR, and calculateSrcByte takes
// that node and finds the ultimate src and byte position. For example, the
// supported LoadCombine pattern for vector loads is as follows:
//                            t1
//                            or
//                  /                    \
//                 t2                    t3
//                zext                  shl
//                 |                     |  \
//                 t4                    t5  16
//                 or                  anyext
//              /      \                 |
//             t6       t7               t8
//            srl      shl               or
//           /  |      /  \          /        \
//          t9  t10  t11  t12      t13         t14
//       trunc*  8  trunc*  8      and         and
//          |         |           /   \       /   \
//         t15       t16        t17   t18   t19   t20
//                             trunc* 255   srl  -256
//                                |          /  \
//                               t15       t15   16
11717 | // |
11718 | // *In this example, the truncs are from i32->i16 |
11719 | // |
// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
// respectively. calculateSrcByte would find (given node) -> ultimate src &
// byte position: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
// After finding the mapping, we can combine the tree into vperm t15, t16,
// 0x05000407.
11725 | |
11726 | // Find the source and byte position from a node. |
// \p DestByte is the byte position of the dest of the or that the src
// ultimately provides. \p SrcIndex is the byte of the src that maps to this
// byte of the dest of the or. \p Depth tracks how many recursive iterations we
// have performed.
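// E.g. for Op = (srl t, 8) and SrcIndex = 0 this resolves to byte 1 of t,
// since the shift moves every source byte down by one byte position.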
11731 | static const std::optional<ByteProvider<SDValue>> |
11732 | calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0, |
11733 | unsigned Depth = 0) { |
11734 | // We may need to recursively traverse a series of SRLs |
11735 | if (Depth >= 6) |
11736 | return std::nullopt; |
11737 | |
11738 | if (Op.getValueSizeInBits() < 8) |
11739 | return std::nullopt; |
11740 | |
11741 | if (Op.getValueType().isVector()) |
11742 | return ByteProvider<SDValue>::getSrc(Val: Op, ByteOffset: DestByte, VectorOffset: SrcIndex); |
11743 | |
11744 | switch (Op->getOpcode()) { |
11745 | case ISD::TRUNCATE: { |
11746 | return calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte, SrcIndex, Depth: Depth + 1); |
11747 | } |
11748 | |
11749 | case ISD::SIGN_EXTEND: |
11750 | case ISD::ZERO_EXTEND: |
11751 | case ISD::SIGN_EXTEND_INREG: { |
11752 | SDValue NarrowOp = Op->getOperand(Num: 0); |
11753 | auto NarrowVT = NarrowOp.getValueType(); |
11754 | if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) { |
11755 | auto *VTSign = cast<VTSDNode>(Val: Op->getOperand(Num: 1)); |
11756 | NarrowVT = VTSign->getVT(); |
11757 | } |
11758 | if (!NarrowVT.isByteSized()) |
11759 | return std::nullopt; |
11760 | uint64_t NarrowByteWidth = NarrowVT.getStoreSize(); |
11761 | |
11762 | if (SrcIndex >= NarrowByteWidth) |
11763 | return std::nullopt; |
11764 | return calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte, SrcIndex, Depth: Depth + 1); |
11765 | } |
11766 | |
11767 | case ISD::SRA: |
11768 | case ISD::SRL: { |
11769 | auto ShiftOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1)); |
11770 | if (!ShiftOp) |
11771 | return std::nullopt; |
11772 | |
11773 | uint64_t BitShift = ShiftOp->getZExtValue(); |
11774 | |
11775 | if (BitShift % 8 != 0) |
11776 | return std::nullopt; |
11777 | |
11778 | SrcIndex += BitShift / 8; |
11779 | |
11780 | return calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte, SrcIndex, Depth: Depth + 1); |
11781 | } |
11782 | |
11783 | default: { |
11784 | return ByteProvider<SDValue>::getSrc(Val: Op, ByteOffset: DestByte, VectorOffset: SrcIndex); |
11785 | } |
11786 | } |
11787 | llvm_unreachable("fully handled switch" ); |
11788 | } |
11789 | |
11790 | // For a byte position in the result of an Or, traverse the tree and find the |
11791 | // node (and the byte of the node) which ultimately provides this {Or, |
11792 | // BytePosition}. \p Op is the operand we are currently examining. \p Index is |
11793 | // the byte position of the Op that corresponds with the originally requested |
11794 | // byte of the Or \p Depth tracks how many recursive iterations we have |
11795 | // performed. \p StartingIndex is the originally requested byte of the Or |
11796 | static const std::optional<ByteProvider<SDValue>> |
11797 | calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth, |
11798 | unsigned StartingIndex = 0) { |
// Finding the Src tree of the RHS of an or typically requires at least one
// additional level of depth.
11801 | if (Depth > 6) |
11802 | return std::nullopt; |
11803 | |
11804 | unsigned BitWidth = Op.getScalarValueSizeInBits(); |
11805 | if (BitWidth % 8 != 0) |
11806 | return std::nullopt; |
11807 | if (Index > BitWidth / 8 - 1) |
11808 | return std::nullopt; |
11809 | |
11810 | bool IsVec = Op.getValueType().isVector(); |
11811 | switch (Op.getOpcode()) { |
11812 | case ISD::OR: { |
11813 | if (IsVec) |
11814 | return std::nullopt; |
11815 | |
11816 | auto RHS = calculateByteProvider(Op: Op.getOperand(i: 1), Index, Depth: Depth + 1, |
11817 | StartingIndex); |
11818 | if (!RHS) |
11819 | return std::nullopt; |
11820 | auto LHS = calculateByteProvider(Op: Op.getOperand(i: 0), Index, Depth: Depth + 1, |
11821 | StartingIndex); |
11822 | if (!LHS) |
11823 | return std::nullopt; |
11824 | // A well formed Or will have two ByteProviders for each byte, one of which |
11825 | // is constant zero |
11826 | if (!LHS->isConstantZero() && !RHS->isConstantZero()) |
11827 | return std::nullopt; |
11828 | if (!LHS || LHS->isConstantZero()) |
11829 | return RHS; |
11830 | if (!RHS || RHS->isConstantZero()) |
11831 | return LHS; |
11832 | return std::nullopt; |
11833 | } |
11834 | |
11835 | case ISD::AND: { |
11836 | if (IsVec) |
11837 | return std::nullopt; |
11838 | |
11839 | auto BitMaskOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1)); |
11840 | if (!BitMaskOp) |
11841 | return std::nullopt; |
11842 | |
11843 | uint32_t BitMask = BitMaskOp->getZExtValue(); |
// Bits covering the byte we are looking for (Index).
11845 | uint32_t IndexMask = 0xFF << (Index * 8); |
11846 | |
11847 | if ((IndexMask & BitMask) != IndexMask) { |
11848 | // If the result of the and partially provides the byte, then it |
11849 | // is not well formatted |
11850 | if (IndexMask & BitMask) |
11851 | return std::nullopt; |
11852 | return ByteProvider<SDValue>::getConstantZero(); |
11853 | } |
11854 | |
11855 | return calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte: StartingIndex, SrcIndex: Index); |
11856 | } |
11857 | |
11858 | case ISD::FSHR: { |
11859 | if (IsVec) |
11860 | return std::nullopt; |
11861 | |
11862 | // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW)) |
11863 | auto ShiftOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 2)); |
11864 | if (!ShiftOp || Op.getValueType().isVector()) |
11865 | return std::nullopt; |
11866 | |
11867 | uint64_t BitsProvided = Op.getValueSizeInBits(); |
11868 | if (BitsProvided % 8 != 0) |
11869 | return std::nullopt; |
11870 | |
11871 | uint64_t BitShift = ShiftOp->getAPIntValue().urem(RHS: BitsProvided); |
11872 | if (BitShift % 8) |
11873 | return std::nullopt; |
11874 | |
11875 | uint64_t ConcatSizeInBytes = BitsProvided / 4; |
11876 | uint64_t ByteShift = BitShift / 8; |
11877 | |
11878 | uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes; |
11879 | uint64_t BytesProvided = BitsProvided / 8; |
11880 | SDValue NextOp = Op.getOperand(i: NewIndex >= BytesProvided ? 0 : 1); |
11881 | NewIndex %= BytesProvided; |
11882 | return calculateByteProvider(Op: NextOp, Index: NewIndex, Depth: Depth + 1, StartingIndex); |
11883 | } |
11884 | |
11885 | case ISD::SRA: |
11886 | case ISD::SRL: { |
11887 | if (IsVec) |
11888 | return std::nullopt; |
11889 | |
11890 | auto ShiftOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1)); |
11891 | if (!ShiftOp) |
11892 | return std::nullopt; |
11893 | |
11894 | uint64_t BitShift = ShiftOp->getZExtValue(); |
11895 | if (BitShift % 8) |
11896 | return std::nullopt; |
11897 | |
11898 | auto BitsProvided = Op.getScalarValueSizeInBits(); |
11899 | if (BitsProvided % 8 != 0) |
11900 | return std::nullopt; |
11901 | |
11902 | uint64_t BytesProvided = BitsProvided / 8; |
11903 | uint64_t ByteShift = BitShift / 8; |
// The dest of the shift will have good [0 : (BytesProvided - ByteShift)]
// bytes. If the byte we are trying to provide (as tracked by Index) falls in
// this range, then the SRL provides the byte. The byte of interest of the src
// of the SRL is Index + ByteShift.
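// E.g. for (srl t, 16) on an i32 value, result bytes 0 and 1 come from bytes
// 2 and 3 of t, while result bytes 2 and 3 are known to be zero.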
11908 | return BytesProvided - ByteShift > Index |
11909 | ? calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte: StartingIndex, |
11910 | SrcIndex: Index + ByteShift) |
11911 | : ByteProvider<SDValue>::getConstantZero(); |
11912 | } |
11913 | |
11914 | case ISD::SHL: { |
11915 | if (IsVec) |
11916 | return std::nullopt; |
11917 | |
11918 | auto ShiftOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1)); |
11919 | if (!ShiftOp) |
11920 | return std::nullopt; |
11921 | |
11922 | uint64_t BitShift = ShiftOp->getZExtValue(); |
11923 | if (BitShift % 8 != 0) |
11924 | return std::nullopt; |
11925 | uint64_t ByteShift = BitShift / 8; |
11926 | |
// If we are shifting by an amount greater than (or equal to)
// the index we are trying to provide, then it provides 0s. If not,
// then these bytes are not definitively 0s, and the corresponding byte
// of interest is Index - ByteShift of the src.
11931 | return Index < ByteShift |
11932 | ? ByteProvider<SDValue>::getConstantZero() |
11933 | : calculateByteProvider(Op: Op.getOperand(i: 0), Index: Index - ByteShift, |
11934 | Depth: Depth + 1, StartingIndex); |
11935 | } |
11936 | case ISD::ANY_EXTEND: |
11937 | case ISD::SIGN_EXTEND: |
11938 | case ISD::ZERO_EXTEND: |
11939 | case ISD::SIGN_EXTEND_INREG: |
11940 | case ISD::AssertZext: |
11941 | case ISD::AssertSext: { |
11942 | if (IsVec) |
11943 | return std::nullopt; |
11944 | |
11945 | SDValue NarrowOp = Op->getOperand(Num: 0); |
11946 | unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits(); |
11947 | if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG || |
11948 | Op->getOpcode() == ISD::AssertZext || |
11949 | Op->getOpcode() == ISD::AssertSext) { |
11950 | auto *VTSign = cast<VTSDNode>(Val: Op->getOperand(Num: 1)); |
11951 | NarrowBitWidth = VTSign->getVT().getSizeInBits(); |
11952 | } |
11953 | if (NarrowBitWidth % 8 != 0) |
11954 | return std::nullopt; |
11955 | uint64_t NarrowByteWidth = NarrowBitWidth / 8; |
11956 | |
11957 | if (Index >= NarrowByteWidth) |
11958 | return Op.getOpcode() == ISD::ZERO_EXTEND |
11959 | ? std::optional<ByteProvider<SDValue>>( |
11960 | ByteProvider<SDValue>::getConstantZero()) |
11961 | : std::nullopt; |
11962 | return calculateByteProvider(Op: NarrowOp, Index, Depth: Depth + 1, StartingIndex); |
11963 | } |
11964 | |
11965 | case ISD::TRUNCATE: { |
11966 | if (IsVec) |
11967 | return std::nullopt; |
11968 | |
11969 | uint64_t NarrowByteWidth = BitWidth / 8; |
11970 | |
11971 | if (NarrowByteWidth >= Index) { |
11972 | return calculateByteProvider(Op: Op.getOperand(i: 0), Index, Depth: Depth + 1, |
11973 | StartingIndex); |
11974 | } |
11975 | |
11976 | return std::nullopt; |
11977 | } |
11978 | |
11979 | case ISD::CopyFromReg: { |
11980 | if (BitWidth / 8 > Index) |
11981 | return calculateSrcByte(Op, DestByte: StartingIndex, SrcIndex: Index); |
11982 | |
11983 | return std::nullopt; |
11984 | } |
11985 | |
11986 | case ISD::LOAD: { |
11987 | auto L = cast<LoadSDNode>(Val: Op.getNode()); |
11988 | |
11989 | unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits(); |
11990 | if (NarrowBitWidth % 8 != 0) |
11991 | return std::nullopt; |
11992 | uint64_t NarrowByteWidth = NarrowBitWidth / 8; |
11993 | |
// If the width of the load does not reach the byte we are trying to provide
// and it is not a ZEXTLOAD, then the load does not provide for the byte in
// question.
11997 | if (Index >= NarrowByteWidth) { |
11998 | return L->getExtensionType() == ISD::ZEXTLOAD |
11999 | ? std::optional<ByteProvider<SDValue>>( |
12000 | ByteProvider<SDValue>::getConstantZero()) |
12001 | : std::nullopt; |
12002 | } |
12003 | |
12004 | if (NarrowByteWidth > Index) { |
12005 | return calculateSrcByte(Op, DestByte: StartingIndex, SrcIndex: Index); |
12006 | } |
12007 | |
12008 | return std::nullopt; |
12009 | } |
12010 | |
12011 | case ISD::BSWAP: { |
12012 | if (IsVec) |
12013 | return std::nullopt; |
12014 | |
12015 | return calculateByteProvider(Op: Op->getOperand(Num: 0), Index: BitWidth / 8 - Index - 1, |
12016 | Depth: Depth + 1, StartingIndex); |
12017 | } |
12018 | |
12019 | case ISD::EXTRACT_VECTOR_ELT: { |
12020 | auto IdxOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1)); |
12021 | if (!IdxOp) |
12022 | return std::nullopt; |
12023 | auto VecIdx = IdxOp->getZExtValue(); |
12024 | auto ScalarSize = Op.getScalarValueSizeInBits(); |
12025 | if (ScalarSize < 32) |
12026 | Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index; |
12027 | return calculateSrcByte(Op: ScalarSize >= 32 ? Op : Op.getOperand(i: 0), |
12028 | DestByte: StartingIndex, SrcIndex: Index); |
12029 | } |
12030 | |
12031 | case AMDGPUISD::PERM: { |
12032 | if (IsVec) |
12033 | return std::nullopt; |
12034 | |
12035 | auto PermMask = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 2)); |
12036 | if (!PermMask) |
12037 | return std::nullopt; |
12038 | |
12039 | auto IdxMask = |
12040 | (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8); |
12041 | if (IdxMask > 0x07 && IdxMask != 0x0c) |
12042 | return std::nullopt; |
12043 | |
12044 | auto NextOp = Op.getOperand(i: IdxMask > 0x03 ? 0 : 1); |
12045 | auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask; |
12046 | |
12047 | return IdxMask != 0x0c ? calculateSrcByte(Op: NextOp, DestByte: StartingIndex, SrcIndex: NextIndex) |
12048 | : ByteProvider<SDValue>( |
12049 | ByteProvider<SDValue>::getConstantZero()); |
12050 | } |
12051 | |
12052 | default: { |
12053 | return std::nullopt; |
12054 | } |
12055 | } |
12056 | |
12057 | llvm_unreachable("fully handled switch" ); |
12058 | } |
12059 | |
// Returns true if \p Operand is a scalar value that was extended from a 16-bit
// scalar (via an extend node, or an extending or 16-bit load).
12061 | static bool isExtendedFrom16Bits(SDValue &Operand) { |
12062 | |
12063 | switch (Operand.getOpcode()) { |
12064 | case ISD::ANY_EXTEND: |
12065 | case ISD::SIGN_EXTEND: |
12066 | case ISD::ZERO_EXTEND: { |
12067 | auto OpVT = Operand.getOperand(i: 0).getValueType(); |
12068 | return !OpVT.isVector() && OpVT.getSizeInBits() == 16; |
12069 | } |
12070 | case ISD::LOAD: { |
12071 | LoadSDNode *L = cast<LoadSDNode>(Val: Operand.getNode()); |
12072 | auto ExtType = cast<LoadSDNode>(Val: L)->getExtensionType(); |
12073 | if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD || |
12074 | ExtType == ISD::EXTLOAD) { |
12075 | auto MemVT = L->getMemoryVT(); |
12076 | return !MemVT.isVector() && MemVT.getSizeInBits() == 16; |
12077 | } |
12078 | return L->getMemoryVT().getSizeInBits() == 16; |
12079 | } |
12080 | default: |
12081 | return false; |
12082 | } |
12083 | } |
12084 | |
// Returns true if the mask selects consecutive bytes and the first byte begins
// at an even (16-bit aligned) offset from byte 0.
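// E.g. a mask of 0x0302 (bytes 2 and 3 of the source) is accepted, while
// 0x0201 is rejected because the pair starts at an odd byte offset.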
12087 | static bool addresses16Bits(int Mask) { |
12088 | int Low8 = Mask & 0xff; |
12089 | int Hi8 = (Mask & 0xff00) >> 8; |
12090 | |
12091 | assert(Low8 < 8 && Hi8 < 8); |
12092 | // Are the bytes contiguous in the order of increasing addresses. |
12093 | bool IsConsecutive = (Hi8 - Low8 == 1); |
// Is the first byte at a location that is aligned for 16-bit instructions?
// A counterexample is taking 2 consecutive bytes starting at the 8th bit.
// In this case, we still need code to extract the 16-bit operand, so it
// is better to use i8 v_perm.
12098 | bool Is16Aligned = !(Low8 % 2); |
12099 | |
12100 | return IsConsecutive && Is16Aligned; |
12101 | } |
12102 | |
12103 | // Do not lower into v_perm if the operands are actually 16 bit |
12104 | // and the selected bits (based on PermMask) correspond with two |
12105 | // easily addressable 16 bit operands. |
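// E.g. with two 16-bit operands, PermMask 0x05040100 picks an aligned 16-bit
// half from each source, so it returns false and v_perm is not used; a mask
// such as 0x06050403 crosses the 16-bit boundaries and returns true.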
12106 | static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, |
12107 | SDValue &OtherOp) { |
12108 | int Low16 = PermMask & 0xffff; |
12109 | int Hi16 = (PermMask & 0xffff0000) >> 16; |
12110 | |
12111 | auto TempOp = peekThroughBitcasts(V: Op); |
12112 | auto TempOtherOp = peekThroughBitcasts(V: OtherOp); |
12113 | |
12114 | auto OpIs16Bit = |
12115 | TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(Operand&: TempOp); |
12116 | if (!OpIs16Bit) |
12117 | return true; |
12118 | |
12119 | auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 || |
12120 | isExtendedFrom16Bits(Operand&: TempOtherOp); |
12121 | if (!OtherOpIs16Bit) |
12122 | return true; |
12123 | |
// Do we cleanly address both 16-bit halves?
12125 | return !addresses16Bits(Mask: Low16) || !addresses16Bits(Mask: Hi16); |
12126 | } |
12127 | |
12128 | static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, |
12129 | unsigned DWordOffset) { |
12130 | SDValue Ret; |
12131 | |
12132 | auto TypeSize = Src.getValueSizeInBits().getFixedValue(); |
12133 | // ByteProvider must be at least 8 bits |
12134 | assert(Src.getValueSizeInBits().isKnownMultipleOf(8)); |
12135 | |
12136 | if (TypeSize <= 32) |
12137 | return DAG.getBitcastedAnyExtOrTrunc(Op: Src, DL: SL, VT: MVT::i32); |
12138 | |
12139 | if (Src.getValueType().isVector()) { |
12140 | auto ScalarTySize = Src.getScalarValueSizeInBits(); |
12141 | auto ScalarTy = Src.getValueType().getScalarType(); |
12142 | if (ScalarTySize == 32) { |
12143 | return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Src, |
12144 | N2: DAG.getConstant(Val: DWordOffset, DL: SL, VT: MVT::i32)); |
12145 | } |
12146 | if (ScalarTySize > 32) { |
12147 | Ret = DAG.getNode( |
12148 | Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ScalarTy, N1: Src, |
12149 | N2: DAG.getConstant(Val: DWordOffset / (ScalarTySize / 32), DL: SL, VT: MVT::i32)); |
12150 | auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32)); |
12151 | if (ShiftVal) |
12152 | Ret = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: Ret.getValueType(), N1: Ret, |
12153 | N2: DAG.getConstant(Val: ShiftVal, DL: SL, VT: MVT::i32)); |
12154 | return DAG.getBitcastedAnyExtOrTrunc(Op: Ret, DL: SL, VT: MVT::i32); |
12155 | } |
12156 | |
12157 | assert(ScalarTySize < 32); |
12158 | auto NumElements = TypeSize / ScalarTySize; |
12159 | auto Trunc32Elements = (ScalarTySize * NumElements) / 32; |
12160 | auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize; |
12161 | auto NumElementsIn32 = 32 / ScalarTySize; |
12162 | auto NumAvailElements = DWordOffset < Trunc32Elements |
12163 | ? NumElementsIn32 |
12164 | : NumElements - NormalizedTrunc; |
12165 | |
12166 | SmallVector<SDValue, 4> VecSrcs; |
12167 | DAG.ExtractVectorElements(Op: Src, Args&: VecSrcs, Start: DWordOffset * NumElementsIn32, |
12168 | Count: NumAvailElements); |
12169 | |
12170 | Ret = DAG.getBuildVector( |
12171 | VT: MVT::getVectorVT(VT: MVT::getIntegerVT(BitWidth: ScalarTySize), NumElements: NumAvailElements), DL: SL, |
12172 | Ops: VecSrcs); |
12173 | return Ret = DAG.getBitcastedAnyExtOrTrunc(Op: Ret, DL: SL, VT: MVT::i32); |
12174 | } |
12175 | |
12176 | /// Scalar Type |
12177 | auto ShiftVal = 32 * DWordOffset; |
12178 | Ret = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: Src.getValueType(), N1: Src, |
12179 | N2: DAG.getConstant(Val: ShiftVal, DL: SL, VT: MVT::i32)); |
12180 | return DAG.getBitcastedAnyExtOrTrunc(Op: Ret, DL: SL, VT: MVT::i32); |
12181 | } |
12182 | |
12183 | static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { |
12184 | SelectionDAG &DAG = DCI.DAG; |
12185 | [[maybe_unused]] EVT VT = N->getValueType(ResNo: 0); |
12186 | SmallVector<ByteProvider<SDValue>, 8> PermNodes; |
12187 | |
12188 | // VT is known to be MVT::i32, so we need to provide 4 bytes. |
12189 | assert(VT == MVT::i32); |
12190 | for (int i = 0; i < 4; i++) { |
12191 | // Find the ByteProvider that provides the ith byte of the result of OR |
12192 | std::optional<ByteProvider<SDValue>> P = |
12193 | calculateByteProvider(Op: SDValue(N, 0), Index: i, Depth: 0, /*StartingIndex = */ i); |
12194 | // TODO support constantZero |
12195 | if (!P || P->isConstantZero()) |
12196 | return SDValue(); |
12197 | |
12198 | PermNodes.push_back(Elt: *P); |
12199 | } |
12200 | if (PermNodes.size() != 4) |
12201 | return SDValue(); |
12202 | |
12203 | std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4); |
12204 | std::optional<std::pair<unsigned, unsigned>> SecondSrc; |
12205 | uint64_t PermMask = 0x00000000; |
12206 | for (size_t i = 0; i < PermNodes.size(); i++) { |
12207 | auto PermOp = PermNodes[i]; |
12208 | // Since the mask is applied to Src1:Src2, Src1 bytes must be offset |
12209 | // by sizeof(Src2) = 4 |
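// E.g. a byte at offset 2 within the first source gets mask entry 6, while
// offset 2 within the second source gets mask entry 2.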
12210 | int SrcByteAdjust = 4; |
12211 | |
// If the Src uses a byte from a different DWORD, then it corresponds
// with a different source.
12214 | if (!PermOp.hasSameSrc(Other: PermNodes[FirstSrc.first]) || |
12215 | ((PermOp.SrcOffset / 4) != FirstSrc.second)) { |
12216 | if (SecondSrc) |
12217 | if (!PermOp.hasSameSrc(Other: PermNodes[SecondSrc->first]) || |
12218 | ((PermOp.SrcOffset / 4) != SecondSrc->second)) |
12219 | return SDValue(); |
12220 | |
12221 | // Set the index of the second distinct Src node |
12222 | SecondSrc = {i, PermNodes[i].SrcOffset / 4}; |
12223 | assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8)); |
12224 | SrcByteAdjust = 0; |
12225 | } |
12226 | assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8); |
12227 | assert(!DAG.getDataLayout().isBigEndian()); |
12228 | PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8); |
12229 | } |
12230 | SDLoc DL(N); |
12231 | SDValue Op = *PermNodes[FirstSrc.first].Src; |
12232 | Op = getDWordFromOffset(DAG, SL: DL, Src: Op, DWordOffset: FirstSrc.second); |
12233 | assert(Op.getValueSizeInBits() == 32); |
12234 | |
12235 | // Check that we are not just extracting the bytes in order from an op |
12236 | if (!SecondSrc) { |
12237 | int Low16 = PermMask & 0xffff; |
12238 | int Hi16 = (PermMask & 0xffff0000) >> 16; |
12239 | |
12240 | bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100); |
12241 | bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302); |
12242 | |
12243 | // The perm op would really just produce Op. So combine into Op |
12244 | if (WellFormedLow && WellFormedHi) |
12245 | return DAG.getBitcast(VT: MVT::getIntegerVT(BitWidth: 32), V: Op); |
12246 | } |
12247 | |
12248 | SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op; |
12249 | |
12250 | if (SecondSrc) { |
12251 | OtherOp = getDWordFromOffset(DAG, SL: DL, Src: OtherOp, DWordOffset: SecondSrc->second); |
12252 | assert(OtherOp.getValueSizeInBits() == 32); |
12253 | } |
12254 | |
12255 | if (hasNon16BitAccesses(PermMask, Op, OtherOp)) { |
12256 | |
12257 | assert(Op.getValueType().isByteSized() && |
12258 | OtherOp.getValueType().isByteSized()); |
12259 | |
12260 | // If the ultimate src is less than 32 bits, then we will only be |
12261 | // using bytes 0: Op.getValueSizeInBytes() - 1 in the or. |
12262 | // CalculateByteProvider would not have returned Op as source if we |
12263 | // used a byte that is outside its ValueType. Thus, we are free to |
12264 | // ANY_EXTEND as the extended bits are dont-cares. |
12265 | Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, VT: MVT::i32); |
12266 | OtherOp = DAG.getBitcastedAnyExtOrTrunc(Op: OtherOp, DL, VT: MVT::i32); |
12267 | |
12268 | return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: Op, N2: OtherOp, |
12269 | N3: DAG.getConstant(Val: PermMask, DL, VT: MVT::i32)); |
12270 | } |
12271 | return SDValue(); |
12272 | } |
12273 | |
12274 | SDValue SITargetLowering::performOrCombine(SDNode *N, |
12275 | DAGCombinerInfo &DCI) const { |
12276 | SelectionDAG &DAG = DCI.DAG; |
12277 | SDValue LHS = N->getOperand(Num: 0); |
12278 | SDValue RHS = N->getOperand(Num: 1); |
12279 | |
12280 | EVT VT = N->getValueType(ResNo: 0); |
12281 | if (VT == MVT::i1) { |
12282 | // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2) |
12283 | if (LHS.getOpcode() == AMDGPUISD::FP_CLASS && |
12284 | RHS.getOpcode() == AMDGPUISD::FP_CLASS) { |
12285 | SDValue Src = LHS.getOperand(i: 0); |
12286 | if (Src != RHS.getOperand(i: 0)) |
12287 | return SDValue(); |
12288 | |
12289 | const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(Val: LHS.getOperand(i: 1)); |
12290 | const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val: RHS.getOperand(i: 1)); |
12291 | if (!CLHS || !CRHS) |
12292 | return SDValue(); |
12293 | |
12294 | // Only 10 bits are used. |
12295 | static const uint32_t MaxMask = 0x3ff; |
12296 | |
12297 | uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask; |
12298 | SDLoc DL(N); |
12299 | return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL, VT: MVT::i1, |
12300 | N1: Src, N2: DAG.getConstant(Val: NewMask, DL, VT: MVT::i32)); |
12301 | } |
12302 | |
12303 | return SDValue(); |
12304 | } |
12305 | |
12306 | // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2) |
12307 | if (isa<ConstantSDNode>(Val: RHS) && LHS.hasOneUse() && |
12308 | LHS.getOpcode() == AMDGPUISD::PERM && |
12309 | isa<ConstantSDNode>(Val: LHS.getOperand(i: 2))) { |
12310 | uint32_t Sel = getConstantPermuteMask(C: N->getConstantOperandVal(Num: 1)); |
12311 | if (!Sel) |
12312 | return SDValue(); |
12313 | |
12314 | Sel |= LHS.getConstantOperandVal(i: 2); |
12315 | SDLoc DL(N); |
12316 | return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: LHS.getOperand(i: 0), |
12317 | N2: LHS.getOperand(i: 1), N3: DAG.getConstant(Val: Sel, DL, VT: MVT::i32)); |
12318 | } |
12319 | |
12320 | // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2) |
12321 | const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); |
12322 | if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() && |
12323 | N->isDivergent() && TII->pseudoToMCOpcode(Opcode: AMDGPU::V_PERM_B32_e64) != -1) { |
12324 | |
12325 | // If all the uses of an or need to extract the individual elements, do not |
12326 | // attempt to lower into v_perm |
12327 | auto usesCombinedOperand = [](SDNode *OrUse) { |
12328 | // If we have any non-vectorized use, then it is a candidate for v_perm |
12329 | if (OrUse->getOpcode() != ISD::BITCAST || |
12330 | !OrUse->getValueType(ResNo: 0).isVector()) |
12331 | return true; |
12332 | |
// Likewise, if any use of the vectorized value is itself non-vectorized,
// then it is a candidate for v_perm.
12334 | for (auto VUse : OrUse->uses()) { |
12335 | if (!VUse->getValueType(ResNo: 0).isVector()) |
12336 | return true; |
12337 | |
12338 | // If the use of a vector is a store, then combining via a v_perm |
12339 | // is beneficial. |
12340 | // TODO -- whitelist more uses |
12341 | for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg}) |
12342 | if (VUse->getOpcode() == VectorwiseOp) |
12343 | return true; |
12344 | } |
12345 | return false; |
12346 | }; |
12347 | |
12348 | if (!any_of(Range: N->uses(), P: usesCombinedOperand)) |
12349 | return SDValue(); |
12350 | |
12351 | uint32_t LHSMask = getPermuteMask(V: LHS); |
12352 | uint32_t RHSMask = getPermuteMask(V: RHS); |
12353 | |
12354 | if (LHSMask != ~0u && RHSMask != ~0u) { |
12355 | // Canonicalize the expression in an attempt to have fewer unique masks |
12356 | // and therefore fewer registers used to hold the masks. |
12357 | if (LHSMask > RHSMask) { |
12358 | std::swap(a&: LHSMask, b&: RHSMask); |
12359 | std::swap(a&: LHS, b&: RHS); |
12360 | } |
12361 | |
// Select 0xc for each lane used from the source operand. Zero has the 0xc
// mask set, 0xff has 0xff in the mask, and actual lanes are in the 0-3 range.
12364 | uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c; |
12365 | uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c; |
12366 | |
// Check if we need to combine values from two sources within a byte.
12368 | if (!(LHSUsedLanes & RHSUsedLanes) && |
// If we select the high and low words, keep it for SDWA.
12370 | // TODO: teach SDWA to work with v_perm_b32 and remove the check. |
12371 | !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) { |
12372 | // Kill zero bytes selected by other mask. Zero value is 0xc. |
12373 | LHSMask &= ~RHSUsedLanes; |
12374 | RHSMask &= ~LHSUsedLanes; |
12375 | // Add 4 to each active LHS lane |
12376 | LHSMask |= LHSUsedLanes & 0x04040404; |
12377 | // Combine masks |
12378 | uint32_t Sel = LHSMask | RHSMask; |
12379 | SDLoc DL(N); |
12380 | |
12381 | return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, |
12382 | N1: LHS.getOperand(i: 0), N2: RHS.getOperand(i: 0), |
12383 | N3: DAG.getConstant(Val: Sel, DL, VT: MVT::i32)); |
12384 | } |
12385 | } |
12386 | if (LHSMask == ~0u || RHSMask == ~0u) { |
12387 | if (SDValue Perm = matchPERM(N, DCI)) |
12388 | return Perm; |
12389 | } |
12390 | } |
12391 | |
12392 | if (VT != MVT::i64 || DCI.isBeforeLegalizeOps()) |
12393 | return SDValue(); |
12394 | |
12395 | // TODO: This could be a generic combine with a predicate for extracting the |
12396 | // high half of an integer being free. |
12397 | |
12398 | // (or i64:x, (zero_extend i32:y)) -> |
12399 | // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x))) |
12400 | if (LHS.getOpcode() == ISD::ZERO_EXTEND && |
12401 | RHS.getOpcode() != ISD::ZERO_EXTEND) |
12402 | std::swap(a&: LHS, b&: RHS); |
12403 | |
12404 | if (RHS.getOpcode() == ISD::ZERO_EXTEND) { |
12405 | SDValue ExtSrc = RHS.getOperand(i: 0); |
12406 | EVT SrcVT = ExtSrc.getValueType(); |
12407 | if (SrcVT == MVT::i32) { |
12408 | SDLoc SL(N); |
12409 | SDValue LowLHS, HiBits; |
12410 | std::tie(args&: LowLHS, args&: HiBits) = split64BitValue(Op: LHS, DAG); |
12411 | SDValue LowOr = DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: LowLHS, N2: ExtSrc); |
12412 | |
12413 | DCI.AddToWorklist(N: LowOr.getNode()); |
12414 | DCI.AddToWorklist(N: HiBits.getNode()); |
12415 | |
12416 | SDValue Vec = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32, |
12417 | N1: LowOr, N2: HiBits); |
12418 | return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: Vec); |
12419 | } |
12420 | } |
12421 | |
12422 | const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1)); |
12423 | if (CRHS) { |
12424 | if (SDValue Split |
12425 | = splitBinaryBitConstantOp(DCI, SL: SDLoc(N), Opc: ISD::OR, |
12426 | LHS: N->getOperand(Num: 0), CRHS)) |
12427 | return Split; |
12428 | } |
12429 | |
12430 | return SDValue(); |
12431 | } |
12432 | |
12433 | SDValue SITargetLowering::performXorCombine(SDNode *N, |
12434 | DAGCombinerInfo &DCI) const { |
12435 | if (SDValue RV = reassociateScalarOps(N, DAG&: DCI.DAG)) |
12436 | return RV; |
12437 | |
12438 | SDValue LHS = N->getOperand(Num: 0); |
12439 | SDValue RHS = N->getOperand(Num: 1); |
12440 | |
12441 | const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS); |
12442 | SelectionDAG &DAG = DCI.DAG; |
12443 | |
12444 | EVT VT = N->getValueType(ResNo: 0); |
12445 | if (CRHS && VT == MVT::i64) { |
12446 | if (SDValue Split |
12447 | = splitBinaryBitConstantOp(DCI, SL: SDLoc(N), Opc: ISD::XOR, LHS, CRHS)) |
12448 | return Split; |
12449 | } |
12450 | |
12451 | // Make sure to apply the 64-bit constant splitting fold before trying to fold |
12452 | // fneg-like xors into 64-bit select. |
12453 | if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) { |
12454 | // This looks like an fneg, try to fold as a source modifier. |
12455 | if (CRHS && CRHS->getAPIntValue().isSignMask() && |
12456 | shouldFoldFNegIntoSrc(FNeg: N, FNegSrc: LHS)) { |
12457 | // xor (select c, a, b), 0x80000000 -> |
12458 | // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b))) |
12459 | SDLoc DL(N); |
12460 | SDValue CastLHS = |
12461 | DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f32, Operand: LHS->getOperand(Num: 1)); |
12462 | SDValue CastRHS = |
12463 | DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f32, Operand: LHS->getOperand(Num: 2)); |
12464 | SDValue FNegLHS = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f32, Operand: CastLHS); |
12465 | SDValue FNegRHS = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f32, Operand: CastRHS); |
12466 | SDValue NewSelect = DAG.getNode(Opcode: ISD::SELECT, DL, VT: MVT::f32, |
12467 | N1: LHS->getOperand(Num: 0), N2: FNegLHS, N3: FNegRHS); |
12468 | return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: NewSelect); |
12469 | } |
12470 | } |
12471 | |
12472 | return SDValue(); |
12473 | } |
12474 | |
12475 | SDValue SITargetLowering::performZeroExtendCombine(SDNode *N, |
12476 | DAGCombinerInfo &DCI) const { |
12477 | if (!Subtarget->has16BitInsts() || |
12478 | DCI.getDAGCombineLevel() < AfterLegalizeDAG) |
12479 | return SDValue(); |
12480 | |
12481 | EVT VT = N->getValueType(ResNo: 0); |
12482 | if (VT != MVT::i32) |
12483 | return SDValue(); |
12484 | |
12485 | SDValue Src = N->getOperand(Num: 0); |
12486 | if (Src.getValueType() != MVT::i16) |
12487 | return SDValue(); |
12488 | |
12489 | return SDValue(); |
12490 | } |
12491 | |
12492 | SDValue |
12493 | SITargetLowering::performSignExtendInRegCombine(SDNode *N, |
12494 | DAGCombinerInfo &DCI) const { |
12495 | SDValue Src = N->getOperand(Num: 0); |
12496 | auto *VTSign = cast<VTSDNode>(Val: N->getOperand(Num: 1)); |
12497 | |
12498 | // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them |
12499 | // with s_buffer_load_i8 and s_buffer_load_i16 respectively. |
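// e.g. (sign_extend_inreg (i32 (SBUFFER_LOAD_UBYTE rsrc, offset, cpol)), i8)
//   --> (i32 (SBUFFER_LOAD_BYTE rsrc, offset, cpol))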
12500 | if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE && |
12501 | VTSign->getVT() == MVT::i8) || |
12502 | (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT && |
12503 | VTSign->getVT() == MVT::i16))) { |
12504 | assert(Subtarget->hasScalarSubwordLoads() && |
12505 | "s_buffer_load_{u8, i8} are supported " |
12506 | "in GFX12 (or newer) architectures." ); |
12507 | EVT VT = Src.getValueType(); |
12508 | unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE) |
12509 | ? AMDGPUISD::SBUFFER_LOAD_BYTE |
12510 | : AMDGPUISD::SBUFFER_LOAD_SHORT; |
12511 | SDLoc DL(N); |
12512 | SDVTList ResList = DCI.DAG.getVTList(VT: MVT::i32); |
12513 | SDValue Ops[] = { |
12514 | Src.getOperand(i: 0), // source register |
12515 | Src.getOperand(i: 1), // offset |
12516 | Src.getOperand(i: 2) // cachePolicy |
12517 | }; |
12518 | auto *M = cast<MemSDNode>(Val&: Src); |
12519 | SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode( |
12520 | Opcode: Opc, dl: DL, VTList: ResList, Ops, MemVT: M->getMemoryVT(), MMO: M->getMemOperand()); |
12521 | SDValue LoadVal = DCI.DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: BufferLoad); |
12522 | return LoadVal; |
12523 | } |
12524 | if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE && |
12525 | VTSign->getVT() == MVT::i8) || |
12526 | (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT && |
12527 | VTSign->getVT() == MVT::i16)) && |
12528 | Src.hasOneUse()) { |
12529 | auto *M = cast<MemSDNode>(Val&: Src); |
12530 | SDValue Ops[] = { |
12531 | Src.getOperand(i: 0), // Chain |
12532 | Src.getOperand(i: 1), // rsrc |
12533 | Src.getOperand(i: 2), // vindex |
12534 | Src.getOperand(i: 3), // voffset |
12535 | Src.getOperand(i: 4), // soffset |
12536 | Src.getOperand(i: 5), // offset |
12537 | Src.getOperand(i: 6), |
12538 | Src.getOperand(i: 7) |
12539 | }; |
12540 | // replace with BUFFER_LOAD_BYTE/SHORT |
12541 | SDVTList ResList = DCI.DAG.getVTList(VT1: MVT::i32, |
12542 | VT2: Src.getOperand(i: 0).getValueType()); |
12543 | unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE) ? |
12544 | AMDGPUISD::BUFFER_LOAD_BYTE : AMDGPUISD::BUFFER_LOAD_SHORT; |
12545 | SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(Opcode: Opc, dl: SDLoc(N), |
12546 | VTList: ResList, |
12547 | Ops, MemVT: M->getMemoryVT(), |
12548 | MMO: M->getMemOperand()); |
12549 | return DCI.DAG.getMergeValues(Ops: {BufferLoadSignExt, |
12550 | BufferLoadSignExt.getValue(R: 1)}, dl: SDLoc(N)); |
12551 | } |
12552 | return SDValue(); |
12553 | } |
12554 | |
12555 | SDValue SITargetLowering::performClassCombine(SDNode *N, |
12556 | DAGCombinerInfo &DCI) const { |
12557 | SelectionDAG &DAG = DCI.DAG; |
12558 | SDValue Mask = N->getOperand(Num: 1); |
12559 | |
12560 | // fp_class x, 0 -> false |
12561 | if (isNullConstant(V: Mask)) |
12562 | return DAG.getConstant(Val: 0, DL: SDLoc(N), VT: MVT::i1); |
12563 | |
12564 | if (N->getOperand(Num: 0).isUndef()) |
12565 | return DAG.getUNDEF(VT: MVT::i1); |
12566 | |
12567 | return SDValue(); |
12568 | } |
12569 | |
12570 | SDValue SITargetLowering::performRcpCombine(SDNode *N, |
12571 | DAGCombinerInfo &DCI) const { |
12572 | EVT VT = N->getValueType(ResNo: 0); |
12573 | SDValue N0 = N->getOperand(Num: 0); |
12574 | |
12575 | if (N0.isUndef()) { |
12576 | return DCI.DAG.getConstantFP( |
12577 | Val: APFloat::getQNaN(Sem: SelectionDAG::EVTToAPFloatSemantics(VT)), DL: SDLoc(N), |
12578 | VT); |
12579 | } |
12580 | |
12581 | if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP || |
12582 | N0.getOpcode() == ISD::SINT_TO_FP)) { |
12583 | return DCI.DAG.getNode(Opcode: AMDGPUISD::RCP_IFLAG, DL: SDLoc(N), VT, Operand: N0, |
12584 | Flags: N->getFlags()); |
12585 | } |
12586 | |
12587 | // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here. |
12588 | if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) && |
12589 | N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) { |
12590 | return DCI.DAG.getNode(Opcode: AMDGPUISD::RSQ, DL: SDLoc(N), VT, |
12591 | Operand: N0.getOperand(i: 0), Flags: N->getFlags()); |
12592 | } |
12593 | |
12594 | return AMDGPUTargetLowering::performRcpCombine(N, DCI); |
12595 | } |
12596 | |
12597 | bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op, |
12598 | unsigned MaxDepth) const { |
12599 | unsigned Opcode = Op.getOpcode(); |
12600 | if (Opcode == ISD::FCANONICALIZE) |
12601 | return true; |
12602 | |
12603 | if (auto *CFP = dyn_cast<ConstantFPSDNode>(Val&: Op)) { |
12604 | const auto &F = CFP->getValueAPF(); |
12605 | if (F.isNaN() && F.isSignaling()) |
12606 | return false; |
12607 | if (!F.isDenormal()) |
12608 | return true; |
12609 | |
12610 | DenormalMode Mode = |
12611 | DAG.getMachineFunction().getDenormalMode(FPType: F.getSemantics()); |
12612 | return Mode == DenormalMode::getIEEE(); |
12613 | } |
12614 | |
12615 | // If source is a result of another standard FP operation it is already in |
12616 | // canonical form. |
12617 | if (MaxDepth == 0) |
12618 | return false; |
12619 | |
12620 | switch (Opcode) { |
12621 | // These will flush denorms if required. |
12622 | case ISD::FADD: |
12623 | case ISD::FSUB: |
12624 | case ISD::FMUL: |
12625 | case ISD::FCEIL: |
12626 | case ISD::FFLOOR: |
12627 | case ISD::FMA: |
12628 | case ISD::FMAD: |
12629 | case ISD::FSQRT: |
12630 | case ISD::FDIV: |
12631 | case ISD::FREM: |
12632 | case ISD::FP_ROUND: |
12633 | case ISD::FP_EXTEND: |
12634 | case ISD::FP16_TO_FP: |
12635 | case ISD::FP_TO_FP16: |
12636 | case ISD::BF16_TO_FP: |
12637 | case ISD::FP_TO_BF16: |
12638 | case ISD::FLDEXP: |
12639 | case AMDGPUISD::FMUL_LEGACY: |
12640 | case AMDGPUISD::FMAD_FTZ: |
12641 | case AMDGPUISD::RCP: |
12642 | case AMDGPUISD::RSQ: |
12643 | case AMDGPUISD::RSQ_CLAMP: |
12644 | case AMDGPUISD::RCP_LEGACY: |
12645 | case AMDGPUISD::RCP_IFLAG: |
12646 | case AMDGPUISD::LOG: |
12647 | case AMDGPUISD::EXP: |
12648 | case AMDGPUISD::DIV_SCALE: |
12649 | case AMDGPUISD::DIV_FMAS: |
12650 | case AMDGPUISD::DIV_FIXUP: |
12651 | case AMDGPUISD::FRACT: |
12652 | case AMDGPUISD::CVT_PKRTZ_F16_F32: |
12653 | case AMDGPUISD::CVT_F32_UBYTE0: |
12654 | case AMDGPUISD::CVT_F32_UBYTE1: |
12655 | case AMDGPUISD::CVT_F32_UBYTE2: |
12656 | case AMDGPUISD::CVT_F32_UBYTE3: |
12657 | case AMDGPUISD::FP_TO_FP16: |
12658 | case AMDGPUISD::SIN_HW: |
12659 | case AMDGPUISD::COS_HW: |
12660 | return true; |
12661 | |
// These can/will be lowered to or combined into bit operations, so we need
// to check their inputs recursively.
12664 | case ISD::FNEG: |
12665 | case ISD::FABS: |
12666 | case ISD::FCOPYSIGN: |
12667 | return isCanonicalized(DAG, Op: Op.getOperand(i: 0), MaxDepth: MaxDepth - 1); |
12668 | |
12669 | case ISD::AND: |
12670 | if (Op.getValueType() == MVT::i32) { |
12671 | // Be careful as we only know it is a bitcast floating point type. It |
12672 | // could be f32, v2f16, we have no way of knowing. Luckily the constant |
12673 | // value that we optimize for, which comes up in fp32 to bf16 conversions, |
12674 | // is valid to optimize for all types. |
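// The 0xffff0000 mask shows up when an f32 is truncated to bf16: it keeps the
// sign, exponent and quiet bit of an f32 (all in the upper 16 bits), and for a
// v2f16 it simply turns the low element into +0.0, so it cannot make a
// canonical input non-canonical.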
12675 | if (auto *RHS = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 1))) { |
12676 | if (RHS->getZExtValue() == 0xffff0000) { |
12677 | return isCanonicalized(DAG, Op: Op.getOperand(i: 0), MaxDepth: MaxDepth - 1); |
12678 | } |
12679 | } |
12680 | } |
12681 | break; |
12682 | |
12683 | case ISD::FSIN: |
12684 | case ISD::FCOS: |
12685 | case ISD::FSINCOS: |
12686 | return Op.getValueType().getScalarType() != MVT::f16; |
12687 | |
12688 | case ISD::FMINNUM: |
12689 | case ISD::FMAXNUM: |
12690 | case ISD::FMINNUM_IEEE: |
12691 | case ISD::FMAXNUM_IEEE: |
12692 | case ISD::FMINIMUM: |
12693 | case ISD::FMAXIMUM: |
12694 | case AMDGPUISD::CLAMP: |
12695 | case AMDGPUISD::FMED3: |
12696 | case AMDGPUISD::FMAX3: |
12697 | case AMDGPUISD::FMIN3: |
12698 | case AMDGPUISD::FMAXIMUM3: |
12699 | case AMDGPUISD::FMINIMUM3: { |
// FIXME: Shouldn't treat the generic operations differently based on these.
// However, we aren't really required to flush the result from
// minnum/maxnum.
12703 | |
12704 | // snans will be quieted, so we only need to worry about denormals. |
12705 | if (Subtarget->supportsMinMaxDenormModes() || |
12706 | // FIXME: denormalsEnabledForType is broken for dynamic |
12707 | denormalsEnabledForType(DAG, VT: Op.getValueType())) |
12708 | return true; |
12709 | |
12710 | // Flushing may be required. |
// On pre-GFX9 targets V_MIN_F32 and others do not flush denorms, so for such
// targets we need to check the inputs recursively.
12713 | |
12714 | // FIXME: Does this apply with clamp? It's implemented with max. |
12715 | for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) { |
12716 | if (!isCanonicalized(DAG, Op: Op.getOperand(i: I), MaxDepth: MaxDepth - 1)) |
12717 | return false; |
12718 | } |
12719 | |
12720 | return true; |
12721 | } |
12722 | case ISD::SELECT: { |
12723 | return isCanonicalized(DAG, Op: Op.getOperand(i: 1), MaxDepth: MaxDepth - 1) && |
12724 | isCanonicalized(DAG, Op: Op.getOperand(i: 2), MaxDepth: MaxDepth - 1); |
12725 | } |
12726 | case ISD::BUILD_VECTOR: { |
12727 | for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) { |
12728 | SDValue SrcOp = Op.getOperand(i); |
12729 | if (!isCanonicalized(DAG, Op: SrcOp, MaxDepth: MaxDepth - 1)) |
12730 | return false; |
12731 | } |
12732 | |
12733 | return true; |
12734 | } |
12735 | case ISD::EXTRACT_VECTOR_ELT: |
12736 | case ISD::EXTRACT_SUBVECTOR: { |
12737 | return isCanonicalized(DAG, Op: Op.getOperand(i: 0), MaxDepth: MaxDepth - 1); |
12738 | } |
12739 | case ISD::INSERT_VECTOR_ELT: { |
12740 | return isCanonicalized(DAG, Op: Op.getOperand(i: 0), MaxDepth: MaxDepth - 1) && |
12741 | isCanonicalized(DAG, Op: Op.getOperand(i: 1), MaxDepth: MaxDepth - 1); |
12742 | } |
12743 | case ISD::UNDEF: |
12744 | // Could be anything. |
12745 | return false; |
12746 | |
12747 | case ISD::BITCAST: |
12748 | // TODO: This is incorrect as it loses track of the operand's type. We may |
12749 | // end up effectively bitcasting from f32 to v2f16 or vice versa, and the |
12750 | // same bits that are canonicalized in one type need not be in the other. |
12751 | return isCanonicalized(DAG, Op: Op.getOperand(i: 0), MaxDepth: MaxDepth - 1); |
12752 | case ISD::TRUNCATE: { |
// Hack around the mess we make when legalizing extract_vector_elt
12754 | if (Op.getValueType() == MVT::i16) { |
12755 | SDValue TruncSrc = Op.getOperand(i: 0); |
12756 | if (TruncSrc.getValueType() == MVT::i32 && |
12757 | TruncSrc.getOpcode() == ISD::BITCAST && |
12758 | TruncSrc.getOperand(i: 0).getValueType() == MVT::v2f16) { |
12759 | return isCanonicalized(DAG, Op: TruncSrc.getOperand(i: 0), MaxDepth: MaxDepth - 1); |
12760 | } |
12761 | } |
12762 | return false; |
12763 | } |
12764 | case ISD::INTRINSIC_WO_CHAIN: { |
12765 | unsigned IntrinsicID = Op.getConstantOperandVal(i: 0); |
12766 | // TODO: Handle more intrinsics |
12767 | switch (IntrinsicID) { |
12768 | case Intrinsic::amdgcn_cvt_pkrtz: |
12769 | case Intrinsic::amdgcn_cubeid: |
12770 | case Intrinsic::amdgcn_frexp_mant: |
12771 | case Intrinsic::amdgcn_fdot2: |
12772 | case Intrinsic::amdgcn_rcp: |
12773 | case Intrinsic::amdgcn_rsq: |
12774 | case Intrinsic::amdgcn_rsq_clamp: |
12775 | case Intrinsic::amdgcn_rcp_legacy: |
12776 | case Intrinsic::amdgcn_rsq_legacy: |
12777 | case Intrinsic::amdgcn_trig_preop: |
12778 | case Intrinsic::amdgcn_log: |
12779 | case Intrinsic::amdgcn_exp2: |
12780 | case Intrinsic::amdgcn_sqrt: |
12781 | return true; |
12782 | default: |
12783 | break; |
12784 | } |
12785 | |
12786 | break; |
12787 | } |
12788 | default: |
12789 | break; |
12790 | } |
12791 | |
12792 | // FIXME: denormalsEnabledForType is broken for dynamic |
12793 | return denormalsEnabledForType(DAG, VT: Op.getValueType()) && |
12794 | DAG.isKnownNeverSNaN(Op); |
12795 | } |
12796 | |
12797 | bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF, |
12798 | unsigned MaxDepth) const { |
12799 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
12800 | MachineInstr *MI = MRI.getVRegDef(Reg); |
12801 | unsigned Opcode = MI->getOpcode(); |
12802 | |
12803 | if (Opcode == AMDGPU::G_FCANONICALIZE) |
12804 | return true; |
12805 | |
12806 | std::optional<FPValueAndVReg> FCR; |
12807 | // Constant splat (can be padded with undef) or scalar constant. |
12808 | if (mi_match(R: Reg, MRI, P: MIPatternMatch::m_GFCstOrSplat(FPValReg&: FCR))) { |
12809 | if (FCR->Value.isSignaling()) |
12810 | return false; |
12811 | if (!FCR->Value.isDenormal()) |
12812 | return true; |
12813 | |
12814 | DenormalMode Mode = MF.getDenormalMode(FPType: FCR->Value.getSemantics()); |
12815 | return Mode == DenormalMode::getIEEE(); |
12816 | } |
12817 | |
12818 | if (MaxDepth == 0) |
12819 | return false; |
12820 | |
12821 | switch (Opcode) { |
12822 | case AMDGPU::G_FADD: |
12823 | case AMDGPU::G_FSUB: |
12824 | case AMDGPU::G_FMUL: |
12825 | case AMDGPU::G_FCEIL: |
12826 | case AMDGPU::G_FFLOOR: |
12827 | case AMDGPU::G_FRINT: |
12828 | case AMDGPU::G_FNEARBYINT: |
12829 | case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND: |
12830 | case AMDGPU::G_INTRINSIC_TRUNC: |
12831 | case AMDGPU::G_INTRINSIC_ROUNDEVEN: |
12832 | case AMDGPU::G_FMA: |
12833 | case AMDGPU::G_FMAD: |
12834 | case AMDGPU::G_FSQRT: |
12835 | case AMDGPU::G_FDIV: |
12836 | case AMDGPU::G_FREM: |
12837 | case AMDGPU::G_FPOW: |
12838 | case AMDGPU::G_FPEXT: |
12839 | case AMDGPU::G_FLOG: |
12840 | case AMDGPU::G_FLOG2: |
12841 | case AMDGPU::G_FLOG10: |
12842 | case AMDGPU::G_FPTRUNC: |
12843 | case AMDGPU::G_AMDGPU_RCP_IFLAG: |
12844 | case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0: |
12845 | case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1: |
12846 | case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2: |
12847 | case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3: |
12848 | return true; |
12849 | case AMDGPU::G_FNEG: |
12850 | case AMDGPU::G_FABS: |
12851 | case AMDGPU::G_FCOPYSIGN: |
12852 | return isCanonicalized(Reg: MI->getOperand(i: 1).getReg(), MF, MaxDepth: MaxDepth - 1); |
12853 | case AMDGPU::G_FMINNUM: |
12854 | case AMDGPU::G_FMAXNUM: |
12855 | case AMDGPU::G_FMINNUM_IEEE: |
12856 | case AMDGPU::G_FMAXNUM_IEEE: |
12857 | case AMDGPU::G_FMINIMUM: |
12858 | case AMDGPU::G_FMAXIMUM: { |
12859 | if (Subtarget->supportsMinMaxDenormModes() || |
12860 | // FIXME: denormalsEnabledForType is broken for dynamic |
12861 | denormalsEnabledForType(Ty: MRI.getType(Reg), MF)) |
12862 | return true; |
12863 | |
12864 | [[fallthrough]]; |
12865 | } |
12866 | case AMDGPU::G_BUILD_VECTOR: |
12867 | for (const MachineOperand &MO : llvm::drop_begin(RangeOrContainer: MI->operands())) |
12868 | if (!isCanonicalized(Reg: MO.getReg(), MF, MaxDepth: MaxDepth - 1)) |
12869 | return false; |
12870 | return true; |
12871 | case AMDGPU::G_INTRINSIC: |
12872 | case AMDGPU::G_INTRINSIC_CONVERGENT: |
12873 | switch (cast<GIntrinsic>(Val: MI)->getIntrinsicID()) { |
12874 | case Intrinsic::amdgcn_fmul_legacy: |
12875 | case Intrinsic::amdgcn_fmad_ftz: |
12876 | case Intrinsic::amdgcn_sqrt: |
12877 | case Intrinsic::amdgcn_fmed3: |
12878 | case Intrinsic::amdgcn_sin: |
12879 | case Intrinsic::amdgcn_cos: |
12880 | case Intrinsic::amdgcn_log: |
12881 | case Intrinsic::amdgcn_exp2: |
12882 | case Intrinsic::amdgcn_log_clamp: |
12883 | case Intrinsic::amdgcn_rcp: |
12884 | case Intrinsic::amdgcn_rcp_legacy: |
12885 | case Intrinsic::amdgcn_rsq: |
12886 | case Intrinsic::amdgcn_rsq_clamp: |
12887 | case Intrinsic::amdgcn_rsq_legacy: |
12888 | case Intrinsic::amdgcn_div_scale: |
12889 | case Intrinsic::amdgcn_div_fmas: |
12890 | case Intrinsic::amdgcn_div_fixup: |
12891 | case Intrinsic::amdgcn_fract: |
12892 | case Intrinsic::amdgcn_cvt_pkrtz: |
12893 | case Intrinsic::amdgcn_cubeid: |
12894 | case Intrinsic::amdgcn_cubema: |
12895 | case Intrinsic::amdgcn_cubesc: |
12896 | case Intrinsic::amdgcn_cubetc: |
12897 | case Intrinsic::amdgcn_frexp_mant: |
12898 | case Intrinsic::amdgcn_fdot2: |
12899 | case Intrinsic::amdgcn_trig_preop: |
12900 | return true; |
12901 | default: |
12902 | break; |
12903 | } |
12904 | |
12905 | [[fallthrough]]; |
12906 | default: |
12907 | return false; |
12908 | } |
12909 | |
12910 | llvm_unreachable("invalid operation" ); |
12911 | } |
12912 | |
12913 | // Constant fold canonicalize. |
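// For example, with f32 denormals flushed (PreserveSign mode) an f32 denormal
// constant folds to +/-0.0 with its sign kept, and a signaling NaN constant
// folds to the default quiet NaN.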
12914 | SDValue SITargetLowering::getCanonicalConstantFP( |
12915 | SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const { |
12916 | // Flush denormals to 0 if not enabled. |
12917 | if (C.isDenormal()) { |
12918 | DenormalMode Mode = |
12919 | DAG.getMachineFunction().getDenormalMode(FPType: C.getSemantics()); |
12920 | if (Mode == DenormalMode::getPreserveSign()) { |
12921 | return DAG.getConstantFP( |
12922 | Val: APFloat::getZero(Sem: C.getSemantics(), Negative: C.isNegative()), DL: SL, VT); |
12923 | } |
12924 | |
12925 | if (Mode != DenormalMode::getIEEE()) |
12926 | return SDValue(); |
12927 | } |
12928 | |
12929 | if (C.isNaN()) { |
12930 | APFloat CanonicalQNaN = APFloat::getQNaN(Sem: C.getSemantics()); |
12931 | if (C.isSignaling()) { |
12932 | // Quiet a signaling NaN. |
12933 | // FIXME: Is this supposed to preserve payload bits? |
12934 | return DAG.getConstantFP(Val: CanonicalQNaN, DL: SL, VT); |
12935 | } |
12936 | |
12937 | // Make sure it is the canonical NaN bitpattern. |
12938 | // |
12939 | // TODO: Can we use -1 as the canonical NaN value since it's an inline |
12940 | // immediate? |
12941 | if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt()) |
12942 | return DAG.getConstantFP(Val: CanonicalQNaN, DL: SL, VT); |
12943 | } |
12944 | |
12945 | // Already canonical. |
12946 | return DAG.getConstantFP(Val: C, DL: SL, VT); |
12947 | } |
12948 | |
12949 | static bool vectorEltWillFoldAway(SDValue Op) { |
12950 | return Op.isUndef() || isa<ConstantFPSDNode>(Val: Op); |
12951 | } |
12952 | |
12953 | SDValue SITargetLowering::performFCanonicalizeCombine( |
12954 | SDNode *N, |
12955 | DAGCombinerInfo &DCI) const { |
12956 | SelectionDAG &DAG = DCI.DAG; |
12957 | SDValue N0 = N->getOperand(Num: 0); |
12958 | EVT VT = N->getValueType(ResNo: 0); |
12959 | |
12960 | // fcanonicalize undef -> qnan |
12961 | if (N0.isUndef()) { |
12962 | APFloat QNaN = APFloat::getQNaN(Sem: SelectionDAG::EVTToAPFloatSemantics(VT)); |
12963 | return DAG.getConstantFP(Val: QNaN, DL: SDLoc(N), VT); |
12964 | } |
12965 | |
12966 | if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N: N0)) { |
12967 | EVT VT = N->getValueType(ResNo: 0); |
12968 | return getCanonicalConstantFP(DAG, SL: SDLoc(N), VT, C: CFP->getValueAPF()); |
12969 | } |
12970 | |
12971 | // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x), |
12972 | // (fcanonicalize k) |
12973 | // |
12974 | // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0 |
12975 | |
12976 | // TODO: This could be better with wider vectors that will be split to v2f16, |
12977 | // and to consider uses since there aren't that many packed operations. |
12978 | if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 && |
12979 | isTypeLegal(VT: MVT::v2f16)) { |
12980 | SDLoc SL(N); |
12981 | SDValue NewElts[2]; |
12982 | SDValue Lo = N0.getOperand(i: 0); |
12983 | SDValue Hi = N0.getOperand(i: 1); |
12984 | EVT EltVT = Lo.getValueType(); |
12985 | |
12986 | if (vectorEltWillFoldAway(Op: Lo) || vectorEltWillFoldAway(Op: Hi)) { |
12987 | for (unsigned I = 0; I != 2; ++I) { |
12988 | SDValue Op = N0.getOperand(i: I); |
12989 | if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Val&: Op)) { |
12990 | NewElts[I] = getCanonicalConstantFP(DAG, SL, VT: EltVT, |
12991 | C: CFP->getValueAPF()); |
12992 | } else if (Op.isUndef()) { |
12993 | // Handled below based on what the other operand is. |
12994 | NewElts[I] = Op; |
12995 | } else { |
12996 | NewElts[I] = DAG.getNode(Opcode: ISD::FCANONICALIZE, DL: SL, VT: EltVT, Operand: Op); |
12997 | } |
12998 | } |
12999 | |
13000 | // If one half is undef, and one is constant, prefer a splat vector rather |
13001 | // than the normal qNaN. If it's a register, prefer 0.0 since that's |
13002 | // cheaper to use and may be free with a packed operation. |
if (NewElts[0].isUndef() && isa<ConstantFPSDNode>(NewElts[1]))
  NewElts[0] = NewElts[1];
13008 | |
13009 | if (NewElts[1].isUndef()) { |
13010 | NewElts[1] = isa<ConstantFPSDNode>(Val: NewElts[0]) ? |
13011 | NewElts[0] : DAG.getConstantFP(Val: 0.0f, DL: SL, VT: EltVT); |
13012 | } |
13013 | |
13014 | return DAG.getBuildVector(VT, DL: SL, Ops: NewElts); |
13015 | } |
13016 | } |
13017 | |
13018 | return SDValue(); |
13019 | } |
13020 | |
13021 | static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) { |
13022 | switch (Opc) { |
13023 | case ISD::FMAXNUM: |
13024 | case ISD::FMAXNUM_IEEE: |
13025 | return AMDGPUISD::FMAX3; |
13026 | case ISD::FMAXIMUM: |
13027 | return AMDGPUISD::FMAXIMUM3; |
13028 | case ISD::SMAX: |
13029 | return AMDGPUISD::SMAX3; |
13030 | case ISD::UMAX: |
13031 | return AMDGPUISD::UMAX3; |
13032 | case ISD::FMINNUM: |
13033 | case ISD::FMINNUM_IEEE: |
13034 | return AMDGPUISD::FMIN3; |
13035 | case ISD::FMINIMUM: |
13036 | return AMDGPUISD::FMINIMUM3; |
13037 | case ISD::SMIN: |
13038 | return AMDGPUISD::SMIN3; |
13039 | case ISD::UMIN: |
13040 | return AMDGPUISD::UMIN3; |
13041 | default: |
13042 | llvm_unreachable("Not a min/max opcode" ); |
13043 | } |
13044 | } |
13045 | |
13046 | SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG, |
13047 | const SDLoc &SL, SDValue Src, |
13048 | SDValue MinVal, |
13049 | SDValue MaxVal, |
13050 | bool Signed) const { |
13051 | |
13052 | // med3 comes from |
13053 | // min(max(x, K0), K1), K0 < K1 |
13054 | // max(min(x, K0), K1), K1 < K0 |
13055 | // |
13056 | // "MinVal" and "MaxVal" respectively refer to the rhs of the |
13057 | // min/max op. |
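// e.g. (smin (smax x, 0), 255) with 0 < 255 becomes (smed3 x, 0, 255),
// clamping x to [0, 255] in a single instruction.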
13058 | ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(Val&: MinVal); |
13059 | ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(Val&: MaxVal); |
13060 | |
13061 | if (!MinK || !MaxK) |
13062 | return SDValue(); |
13063 | |
13064 | if (Signed) { |
13065 | if (MaxK->getAPIntValue().sge(RHS: MinK->getAPIntValue())) |
13066 | return SDValue(); |
13067 | } else { |
13068 | if (MaxK->getAPIntValue().uge(RHS: MinK->getAPIntValue())) |
13069 | return SDValue(); |
13070 | } |
13071 | |
13072 | EVT VT = MinK->getValueType(ResNo: 0); |
13073 | unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3; |
13074 | if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) |
13075 | return DAG.getNode(Opcode: Med3Opc, DL: SL, VT, N1: Src, N2: MaxVal, N3: MinVal); |
13076 | |
13077 | // Note: we could also extend to i32 and use i32 med3 if i16 med3 is |
13078 | // not available, but this is unlikely to be profitable as constants |
13079 | // will often need to be materialized & extended, especially on |
13080 | // pre-GFX10 where VOP3 instructions couldn't take literal operands. |
13081 | return SDValue(); |
13082 | } |
13083 | |
13084 | static ConstantFPSDNode *getSplatConstantFP(SDValue Op) { |
13085 | if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: Op)) |
13086 | return C; |
13087 | |
13088 | if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Val&: Op)) { |
13089 | if (ConstantFPSDNode *C = BV->getConstantFPSplatNode()) |
13090 | return C; |
13091 | } |
13092 | |
13093 | return nullptr; |
13094 | } |
13095 | |
13096 | SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG, |
13097 | const SDLoc &SL, |
13098 | SDValue Op0, |
13099 | SDValue Op1) const { |
13100 | ConstantFPSDNode *K1 = getSplatConstantFP(Op: Op1); |
13101 | if (!K1) |
13102 | return SDValue(); |
13103 | |
13104 | ConstantFPSDNode *K0 = getSplatConstantFP(Op: Op0.getOperand(i: 1)); |
13105 | if (!K0) |
13106 | return SDValue(); |
13107 | |
13108 | // Ordered >= (although NaN inputs should have folded away by now). |
13109 | if (K0->getValueAPF() > K1->getValueAPF()) |
13110 | return SDValue(); |
13111 | |
13112 | const MachineFunction &MF = DAG.getMachineFunction(); |
13113 | const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
13114 | |
13115 | // TODO: Check IEEE bit enabled? |
13116 | EVT VT = Op0.getValueType(); |
13117 | if (Info->getMode().DX10Clamp) { |
13118 | // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the |
13119 | // hardware fmed3 behavior converting to a min. |
13120 | // FIXME: Should this be allowing -0.0? |
13121 | if (K1->isExactlyValue(V: 1.0) && K0->isExactlyValue(V: 0.0)) |
13122 | return DAG.getNode(Opcode: AMDGPUISD::CLAMP, DL: SL, VT, Operand: Op0.getOperand(i: 0)); |
13123 | } |
13124 | |
13125 | // med3 for f16 is only available on gfx9+, and not available for v2f16. |
13126 | if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) { |
13127 | // This isn't safe with signaling NaNs because in IEEE mode, min/max on a |
13128 | // signaling NaN gives a quiet NaN. The quiet NaN input to the min would |
13129 | // then give the other result, which is different from med3 with a NaN |
13130 | // input. |
13131 | SDValue Var = Op0.getOperand(i: 0); |
13132 | if (!DAG.isKnownNeverSNaN(Op: Var)) |
13133 | return SDValue(); |
13134 | |
13135 | const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); |
13136 | |
13137 | if ((!K0->hasOneUse() || TII->isInlineConstant(Imm: K0->getValueAPF())) && |
13138 | (!K1->hasOneUse() || TII->isInlineConstant(Imm: K1->getValueAPF()))) { |
13139 | return DAG.getNode(Opcode: AMDGPUISD::FMED3, DL: SL, VT: K0->getValueType(ResNo: 0), |
13140 | N1: Var, N2: SDValue(K0, 0), N3: SDValue(K1, 0)); |
13141 | } |
13142 | } |
13143 | |
13144 | return SDValue(); |
13145 | } |
13146 | |
13147 | /// \return true if the subtarget supports minimum3 and maximum3 with the given |
13148 | /// base min/max opcode \p Opc for type \p VT. |
13149 | static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, |
13150 | EVT VT) { |
13151 | switch (Opc) { |
13152 | case ISD::FMINNUM: |
13153 | case ISD::FMAXNUM: |
13154 | case ISD::FMINNUM_IEEE: |
13155 | case ISD::FMAXNUM_IEEE: |
13156 | case AMDGPUISD::FMIN_LEGACY: |
13157 | case AMDGPUISD::FMAX_LEGACY: |
13158 | return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()); |
13159 | case ISD::FMINIMUM: |
13160 | case ISD::FMAXIMUM: |
13161 | return (VT == MVT::f32 || VT == MVT::f16) && Subtarget.hasIEEEMinMax3(); |
13162 | case ISD::SMAX: |
13163 | case ISD::SMIN: |
13164 | case ISD::UMAX: |
13165 | case ISD::UMIN: |
13166 | return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16()); |
13167 | default: |
13168 | return false; |
13169 | } |
13170 | |
13171 | llvm_unreachable("not a min/max opcode" ); |
13172 | } |
13173 | |
13174 | SDValue SITargetLowering::performMinMaxCombine(SDNode *N, |
13175 | DAGCombinerInfo &DCI) const { |
13176 | SelectionDAG &DAG = DCI.DAG; |
13177 | |
13178 | EVT VT = N->getValueType(ResNo: 0); |
13179 | unsigned Opc = N->getOpcode(); |
13180 | SDValue Op0 = N->getOperand(Num: 0); |
13181 | SDValue Op1 = N->getOperand(Num: 1); |
13182 | |
// Only do this if the inner op has one use, since otherwise this just
// increases register pressure for no benefit.
13185 | |
13186 | if (supportsMin3Max3(Subtarget: *Subtarget, Opc, VT)) { |
13187 | // max(max(a, b), c) -> max3(a, b, c) |
13188 | // min(min(a, b), c) -> min3(a, b, c) |
13189 | if (Op0.getOpcode() == Opc && Op0.hasOneUse()) { |
13190 | SDLoc DL(N); |
13191 | return DAG.getNode(Opcode: minMaxOpcToMin3Max3Opc(Opc), |
13192 | DL, |
13193 | VT: N->getValueType(ResNo: 0), |
13194 | N1: Op0.getOperand(i: 0), |
13195 | N2: Op0.getOperand(i: 1), |
13196 | N3: Op1); |
13197 | } |
13198 | |
13199 | // Try commuted. |
13200 | // max(a, max(b, c)) -> max3(a, b, c) |
13201 | // min(a, min(b, c)) -> min3(a, b, c) |
13202 | if (Op1.getOpcode() == Opc && Op1.hasOneUse()) { |
13203 | SDLoc DL(N); |
13204 | return DAG.getNode(Opcode: minMaxOpcToMin3Max3Opc(Opc), |
13205 | DL, |
13206 | VT: N->getValueType(ResNo: 0), |
13207 | N1: Op0, |
13208 | N2: Op1.getOperand(i: 0), |
13209 | N3: Op1.getOperand(i: 1)); |
13210 | } |
13211 | } |
13212 | |
13213 | // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1) |
13214 | // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0) |
13215 | if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) { |
13216 | if (SDValue Med3 = performIntMed3ImmCombine( |
13217 | DAG, SL: SDLoc(N), Src: Op0->getOperand(Num: 0), MinVal: Op1, MaxVal: Op0->getOperand(Num: 1), Signed: true)) |
13218 | return Med3; |
13219 | } |
13220 | if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) { |
13221 | if (SDValue Med3 = performIntMed3ImmCombine( |
13222 | DAG, SL: SDLoc(N), Src: Op0->getOperand(Num: 0), MinVal: Op0->getOperand(Num: 1), MaxVal: Op1, Signed: true)) |
13223 | return Med3; |
13224 | } |
13225 | |
13226 | if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) { |
13227 | if (SDValue Med3 = performIntMed3ImmCombine( |
13228 | DAG, SL: SDLoc(N), Src: Op0->getOperand(Num: 0), MinVal: Op1, MaxVal: Op0->getOperand(Num: 1), Signed: false)) |
13229 | return Med3; |
13230 | } |
13231 | if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) { |
13232 | if (SDValue Med3 = performIntMed3ImmCombine( |
13233 | DAG, SL: SDLoc(N), Src: Op0->getOperand(Num: 0), MinVal: Op0->getOperand(Num: 1), MaxVal: Op1, Signed: false)) |
13234 | return Med3; |
13235 | } |
13236 | |
13237 | // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1) |
13238 | if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) || |
13239 | (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) || |
13240 | (Opc == AMDGPUISD::FMIN_LEGACY && |
13241 | Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) && |
13242 | (VT == MVT::f32 || VT == MVT::f64 || |
13243 | (VT == MVT::f16 && Subtarget->has16BitInsts()) || |
13244 | (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) && |
13245 | Op0.hasOneUse()) { |
13246 | if (SDValue Res = performFPMed3ImmCombine(DAG, SL: SDLoc(N), Op0, Op1)) |
13247 | return Res; |
13248 | } |
13249 | |
13250 | return SDValue(); |
13251 | } |
13252 | |
13253 | static bool isClampZeroToOne(SDValue A, SDValue B) { |
13254 | if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(Val&: A)) { |
13255 | if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(Val&: B)) { |
13256 | // FIXME: Should this be allowing -0.0? |
13257 | return (CA->isExactlyValue(V: 0.0) && CB->isExactlyValue(V: 1.0)) || |
13258 | (CA->isExactlyValue(V: 1.0) && CB->isExactlyValue(V: 0.0)); |
13259 | } |
13260 | } |
13261 | |
13262 | return false; |
13263 | } |
13264 | |
13265 | // FIXME: Should only worry about snans for version with chain. |
13266 | SDValue SITargetLowering::performFMed3Combine(SDNode *N, |
13267 | DAGCombinerInfo &DCI) const { |
13268 | EVT VT = N->getValueType(ResNo: 0); |
13269 | // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and |
13270 | // NaNs. With a NaN input, the order of the operands may change the result. |
13271 | |
13272 | SelectionDAG &DAG = DCI.DAG; |
13273 | SDLoc SL(N); |
13274 | |
13275 | SDValue Src0 = N->getOperand(Num: 0); |
13276 | SDValue Src1 = N->getOperand(Num: 1); |
13277 | SDValue Src2 = N->getOperand(Num: 2); |
13278 | |
13279 | if (isClampZeroToOne(A: Src0, B: Src1)) { |
13280 | // const_a, const_b, x -> clamp is safe in all cases including signaling |
13281 | // nans. |
13282 | // FIXME: Should this be allowing -0.0? |
13283 | return DAG.getNode(Opcode: AMDGPUISD::CLAMP, DL: SL, VT, Operand: Src2); |
13284 | } |
13285 | |
13286 | const MachineFunction &MF = DAG.getMachineFunction(); |
13287 | const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
13288 | |
13289 | // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother |
13290 | // handling no dx10-clamp? |
13291 | if (Info->getMode().DX10Clamp) { |
// If NaNs are clamped to 0, we are free to reorder the inputs.
13293 | |
13294 | if (isa<ConstantFPSDNode>(Val: Src0) && !isa<ConstantFPSDNode>(Val: Src1)) |
13295 | std::swap(a&: Src0, b&: Src1); |
13296 | |
13297 | if (isa<ConstantFPSDNode>(Val: Src1) && !isa<ConstantFPSDNode>(Val: Src2)) |
13298 | std::swap(a&: Src1, b&: Src2); |
13299 | |
13300 | if (isa<ConstantFPSDNode>(Val: Src0) && !isa<ConstantFPSDNode>(Val: Src1)) |
13301 | std::swap(a&: Src0, b&: Src1); |
13302 | |
13303 | if (isClampZeroToOne(A: Src1, B: Src2)) |
13304 | return DAG.getNode(Opcode: AMDGPUISD::CLAMP, DL: SL, VT, Operand: Src0); |
13305 | } |
13306 | |
13307 | return SDValue(); |
13308 | } |
13309 | |
13310 | SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N, |
13311 | DAGCombinerInfo &DCI) const { |
13312 | SDValue Src0 = N->getOperand(Num: 0); |
13313 | SDValue Src1 = N->getOperand(Num: 1); |
13314 | if (Src0.isUndef() && Src1.isUndef()) |
13315 | return DCI.DAG.getUNDEF(VT: N->getValueType(ResNo: 0)); |
13316 | return SDValue(); |
13317 | } |
13318 | |
13319 | // Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be |
13320 | // expanded into a set of cmp/select instructions. |
13321 | bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize, |
13322 | unsigned NumElem, |
13323 | bool IsDivergentIdx, |
13324 | const GCNSubtarget *Subtarget) { |
13325 | if (UseDivergentRegisterIndexing) |
13326 | return false; |
13327 | |
13328 | unsigned VecSize = EltSize * NumElem; |
13329 | |
// Sub-dword vectors whose total size is two dwords or less have a better
// implementation.
13331 | if (VecSize <= 64 && EltSize < 32) |
13332 | return false; |
13333 | |
// Always expand the remaining sub-dword cases, otherwise the access will be
// lowered via memory.
13336 | if (EltSize < 32) |
13337 | return true; |
13338 | |
13339 | // Always do this if var-idx is divergent, otherwise it will become a loop. |
13340 | if (IsDivergentIdx) |
13341 | return true; |
13342 | |
13343 | // Large vectors would yield too many compares and v_cndmask_b32 instructions. |
13344 | unsigned NumInsts = NumElem /* Number of compares */ + |
13345 | ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */; |
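// e.g. a variable-index extract from v8i32 takes 8 compares plus 8
// v_cndmask_b32, i.e. NumInsts == 16, so it is only expanded when movrel is
// unavailable.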
13346 | |
13347 | // On some architectures (GFX9) movrel is not available and it's better |
13348 | // to expand. |
13349 | if (!Subtarget->hasMovrel()) |
13350 | return NumInsts <= 16; |
13351 | |
// If movrel is available, use it instead of expanding; e.g. a vector of 8
// dword elements is not expanded.
13354 | return NumInsts <= 15; |
13355 | } |
13356 | |
13357 | bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const { |
13358 | SDValue Idx = N->getOperand(Num: N->getNumOperands() - 1); |
13359 | if (isa<ConstantSDNode>(Val: Idx)) |
13360 | return false; |
13361 | |
13362 | SDValue Vec = N->getOperand(Num: 0); |
13363 | EVT VecVT = Vec.getValueType(); |
13364 | EVT EltVT = VecVT.getVectorElementType(); |
13365 | unsigned EltSize = EltVT.getSizeInBits(); |
13366 | unsigned NumElem = VecVT.getVectorNumElements(); |
13367 | |
13368 | return SITargetLowering::shouldExpandVectorDynExt( |
13369 | EltSize, NumElem, IsDivergentIdx: Idx->isDivergent(), Subtarget: getSubtarget()); |
13370 | } |
13371 | |
SDValue SITargetLowering::performExtractVectorEltCombine(
    SDNode *N, DAGCombinerInfo &DCI) const {
13374 | SDValue Vec = N->getOperand(Num: 0); |
13375 | SelectionDAG &DAG = DCI.DAG; |
13376 | |
13377 | EVT VecVT = Vec.getValueType(); |
13378 | EVT VecEltVT = VecVT.getVectorElementType(); |
13379 | EVT ResVT = N->getValueType(ResNo: 0); |
13380 | |
13381 | unsigned VecSize = VecVT.getSizeInBits(); |
13382 | unsigned VecEltSize = VecEltVT.getSizeInBits(); |
13383 | |
13384 | if ((Vec.getOpcode() == ISD::FNEG || |
13385 | Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) { |
13386 | SDLoc SL(N); |
13387 | SDValue Idx = N->getOperand(Num: 1); |
13388 | SDValue Elt = |
13389 | DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ResVT, N1: Vec.getOperand(i: 0), N2: Idx); |
13390 | return DAG.getNode(Opcode: Vec.getOpcode(), DL: SL, VT: ResVT, Operand: Elt); |
13391 | } |
13392 | |
13393 | // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx) |
13394 | // => |
13395 | // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx) |
13396 | // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx) |
13397 | // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt |
13398 | if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) { |
13399 | SDLoc SL(N); |
13400 | SDValue Idx = N->getOperand(Num: 1); |
13401 | unsigned Opc = Vec.getOpcode(); |
13402 | |
13403 | switch(Opc) { |
13404 | default: |
13405 | break; |
13406 | // TODO: Support other binary operations. |
13407 | case ISD::FADD: |
13408 | case ISD::FSUB: |
13409 | case ISD::FMUL: |
13410 | case ISD::ADD: |
13411 | case ISD::UMIN: |
13412 | case ISD::UMAX: |
13413 | case ISD::SMIN: |
13414 | case ISD::SMAX: |
13415 | case ISD::FMAXNUM: |
13416 | case ISD::FMINNUM: |
13417 | case ISD::FMAXNUM_IEEE: |
13418 | case ISD::FMINNUM_IEEE: |
13419 | case ISD::FMAXIMUM: |
13420 | case ISD::FMINIMUM: { |
13421 | SDValue Elt0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ResVT, |
13422 | N1: Vec.getOperand(i: 0), N2: Idx); |
13423 | SDValue Elt1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ResVT, |
13424 | N1: Vec.getOperand(i: 1), N2: Idx); |
13425 | |
13426 | DCI.AddToWorklist(N: Elt0.getNode()); |
13427 | DCI.AddToWorklist(N: Elt1.getNode()); |
13428 | return DAG.getNode(Opcode: Opc, DL: SL, VT: ResVT, N1: Elt0, N2: Elt1, Flags: Vec->getFlags()); |
13429 | } |
13430 | } |
13431 | } |
13432 | |
13433 | // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx) |
13434 | if (shouldExpandVectorDynExt(N)) { |
13435 | SDLoc SL(N); |
13436 | SDValue Idx = N->getOperand(Num: 1); |
13437 | SDValue V; |
13438 | for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) { |
13439 | SDValue IC = DAG.getVectorIdxConstant(Val: I, DL: SL); |
13440 | SDValue Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ResVT, N1: Vec, N2: IC); |
13441 | if (I == 0) |
13442 | V = Elt; |
13443 | else |
13444 | V = DAG.getSelectCC(DL: SL, LHS: Idx, RHS: IC, True: Elt, False: V, Cond: ISD::SETEQ); |
13445 | } |
13446 | return V; |
13447 | } |
13448 | |
13449 | if (!DCI.isBeforeLegalize()) |
13450 | return SDValue(); |
13451 | |
13452 | // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit |
13453 | // elements. This exposes more load reduction opportunities by replacing |
13454 | // multiple small extract_vector_elements with a single 32-bit extract. |
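// e.g. extracting element 5 from a loaded v8i8 becomes: bitcast to v2i32,
// extract dword 1, shift right by 8, truncate to i8 (BitIndex = 40,
// EltIdx = 1, LeftoverBitIdx = 8).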
13455 | auto *Idx = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1)); |
13456 | if (isa<MemSDNode>(Val: Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() && |
13457 | VecSize > 32 && VecSize % 32 == 0 && Idx) { |
13458 | EVT NewVT = getEquivalentMemType(Context&: *DAG.getContext(), VT: VecVT); |
13459 | |
13460 | unsigned BitIndex = Idx->getZExtValue() * VecEltSize; |
13461 | unsigned EltIdx = BitIndex / 32; |
13462 | unsigned LeftoverBitIdx = BitIndex % 32; |
13463 | SDLoc SL(N); |
13464 | |
13465 | SDValue Cast = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVT, Operand: Vec); |
13466 | DCI.AddToWorklist(N: Cast.getNode()); |
13467 | |
13468 | SDValue Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Cast, |
13469 | N2: DAG.getConstant(Val: EltIdx, DL: SL, VT: MVT::i32)); |
13470 | DCI.AddToWorklist(N: Elt.getNode()); |
13471 | SDValue Srl = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: MVT::i32, N1: Elt, |
13472 | N2: DAG.getConstant(Val: LeftoverBitIdx, DL: SL, VT: MVT::i32)); |
13473 | DCI.AddToWorklist(N: Srl.getNode()); |
13474 | |
13475 | EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger(); |
13476 | SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: VecEltAsIntVT, Operand: Srl); |
13477 | DCI.AddToWorklist(N: Trunc.getNode()); |
13478 | |
13479 | if (VecEltVT == ResVT) { |
13480 | return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: VecEltVT, Operand: Trunc); |
13481 | } |
13482 | |
13483 | assert(ResVT.isScalarInteger()); |
13484 | return DAG.getAnyExtOrTrunc(Op: Trunc, DL: SL, VT: ResVT); |
13485 | } |
13486 | |
13487 | return SDValue(); |
13488 | } |
13489 | |
13490 | SDValue |
13491 | SITargetLowering::performInsertVectorEltCombine(SDNode *N, |
13492 | DAGCombinerInfo &DCI) const { |
13493 | SDValue Vec = N->getOperand(Num: 0); |
13494 | SDValue Idx = N->getOperand(Num: 2); |
13495 | EVT VecVT = Vec.getValueType(); |
13496 | EVT EltVT = VecVT.getVectorElementType(); |
13497 | |
13498 | // INSERT_VECTOR_ELT (<n x e>, var-idx) |
13499 | // => BUILD_VECTOR n x select (e, const-idx) |
13500 | if (!shouldExpandVectorDynExt(N)) |
13501 | return SDValue(); |
13502 | |
13503 | SelectionDAG &DAG = DCI.DAG; |
13504 | SDLoc SL(N); |
13505 | SDValue Ins = N->getOperand(Num: 1); |
13506 | EVT IdxVT = Idx.getValueType(); |
13507 | |
13508 | SmallVector<SDValue, 16> Ops; |
13509 | for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) { |
13510 | SDValue IC = DAG.getConstant(Val: I, DL: SL, VT: IdxVT); |
13511 | SDValue Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, N1: Vec, N2: IC); |
13512 | SDValue V = DAG.getSelectCC(DL: SL, LHS: Idx, RHS: IC, True: Ins, False: Elt, Cond: ISD::SETEQ); |
13513 | Ops.push_back(Elt: V); |
13514 | } |
13515 | |
13516 | return DAG.getBuildVector(VT: VecVT, DL: SL, Ops); |
13517 | } |
13518 | |
13519 | /// Return the source of an fp_extend from f16 to f32, or a converted FP |
13520 | /// constant. |
13521 | static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) { |
13522 | if (Src.getOpcode() == ISD::FP_EXTEND && |
13523 | Src.getOperand(i: 0).getValueType() == MVT::f16) { |
13524 | return Src.getOperand(i: 0); |
13525 | } |
13526 | |
13527 | if (auto *CFP = dyn_cast<ConstantFPSDNode>(Val&: Src)) { |
13528 | APFloat Val = CFP->getValueAPF(); |
13529 | bool LosesInfo = true; |
13530 | Val.convert(ToSemantics: APFloat::IEEEhalf(), RM: APFloat::rmNearestTiesToEven, losesInfo: &LosesInfo); |
13531 | if (!LosesInfo) |
13532 | return DAG.getConstantFP(Val, DL: SDLoc(Src), VT: MVT::f16); |
13533 | } |
13534 | |
13535 | return SDValue(); |
13536 | } |
13537 | |
13538 | SDValue SITargetLowering::performFPRoundCombine(SDNode *N, |
13539 | DAGCombinerInfo &DCI) const { |
13540 | assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() && |
13541 | "combine only useful on gfx8" ); |
13542 | |
13543 | SDValue TruncSrc = N->getOperand(Num: 0); |
13544 | EVT VT = N->getValueType(ResNo: 0); |
13545 | if (VT != MVT::f16) |
13546 | return SDValue(); |
13547 | |
13548 | if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 || |
13549 | TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse()) |
13550 | return SDValue(); |
13551 | |
13552 | SelectionDAG &DAG = DCI.DAG; |
13553 | SDLoc SL(N); |
13554 | |
13555 | // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3, |
13556 | // and expanding it with min/max saves 1 instruction vs. casting to f32 and |
13557 | // casting back. |
13558 | |
13559 | // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) => |
13560 | // fmin(fmax(a, b), fmax(fmin(a, b), c)) |
13561 | SDValue A = strictFPExtFromF16(DAG, Src: TruncSrc.getOperand(i: 0)); |
13562 | if (!A) |
13563 | return SDValue(); |
13564 | |
13565 | SDValue B = strictFPExtFromF16(DAG, Src: TruncSrc.getOperand(i: 1)); |
13566 | if (!B) |
13567 | return SDValue(); |
13568 | |
13569 | SDValue C = strictFPExtFromF16(DAG, Src: TruncSrc.getOperand(i: 2)); |
13570 | if (!C) |
13571 | return SDValue(); |
13572 | |
13573 | // This changes signaling nan behavior. If an input is a signaling nan, it |
13574 | // would have been quieted by the fpext originally. We don't care because |
13575 | // these are unconstrained ops. If we needed to insert quieting canonicalizes |
13576 | // we would be worse off than just doing the promotion. |
13577 | SDValue A1 = DAG.getNode(Opcode: ISD::FMINNUM_IEEE, DL: SL, VT, N1: A, N2: B); |
13578 | SDValue B1 = DAG.getNode(Opcode: ISD::FMAXNUM_IEEE, DL: SL, VT, N1: A, N2: B); |
13579 | SDValue C1 = DAG.getNode(Opcode: ISD::FMAXNUM_IEEE, DL: SL, VT, N1: A1, N2: C); |
13580 | return DAG.getNode(Opcode: ISD::FMINNUM_IEEE, DL: SL, VT, N1: B1, N2: C1); |
13581 | } |
13582 | |
13583 | unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, |
13584 | const SDNode *N0, |
13585 | const SDNode *N1) const { |
13586 | EVT VT = N0->getValueType(ResNo: 0); |
13587 | |
13588 | // Only do this if we are not trying to support denormals. v_mad_f32 does not |
13589 | // support denormals ever. |
13590 | if (((VT == MVT::f32 && |
13591 | denormalModeIsFlushAllF32(MF: DAG.getMachineFunction())) || |
13592 | (VT == MVT::f16 && Subtarget->hasMadF16() && |
13593 | denormalModeIsFlushAllF64F16(MF: DAG.getMachineFunction()))) && |
13594 | isOperationLegal(Op: ISD::FMAD, VT)) |
13595 | return ISD::FMAD; |
13596 | |
13597 | const TargetOptions &Options = DAG.getTarget().Options; |
13598 | if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath || |
13599 | (N0->getFlags().hasAllowContract() && |
13600 | N1->getFlags().hasAllowContract())) && |
13601 | isFMAFasterThanFMulAndFAdd(MF: DAG.getMachineFunction(), VT)) { |
13602 | return ISD::FMA; |
13603 | } |
13604 | |
13605 | return 0; |
13606 | } |
13607 | |
13608 | // For a reassociatable opcode perform: |
13609 | // op x, (op y, z) -> op (op x, z), y, if x and z are uniform |
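// e.g. (add x, (add y, z)) with x and z uniform and y divergent becomes
// (add (add x, z), y), so the uniform part can stay in a scalar add.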
13610 | SDValue SITargetLowering::reassociateScalarOps(SDNode *N, |
13611 | SelectionDAG &DAG) const { |
13612 | EVT VT = N->getValueType(ResNo: 0); |
13613 | if (VT != MVT::i32 && VT != MVT::i64) |
13614 | return SDValue(); |
13615 | |
13616 | if (DAG.isBaseWithConstantOffset(Op: SDValue(N, 0))) |
13617 | return SDValue(); |
13618 | |
13619 | unsigned Opc = N->getOpcode(); |
13620 | SDValue Op0 = N->getOperand(Num: 0); |
13621 | SDValue Op1 = N->getOperand(Num: 1); |
13622 | |
13623 | if (!(Op0->isDivergent() ^ Op1->isDivergent())) |
13624 | return SDValue(); |
13625 | |
13626 | if (Op0->isDivergent()) |
13627 | std::swap(a&: Op0, b&: Op1); |
13628 | |
13629 | if (Op1.getOpcode() != Opc || !Op1.hasOneUse()) |
13630 | return SDValue(); |
13631 | |
13632 | SDValue Op2 = Op1.getOperand(i: 1); |
13633 | Op1 = Op1.getOperand(i: 0); |
13634 | if (!(Op1->isDivergent() ^ Op2->isDivergent())) |
13635 | return SDValue(); |
13636 | |
13637 | if (Op1->isDivergent()) |
13638 | std::swap(a&: Op1, b&: Op2); |
13639 | |
13640 | SDLoc SL(N); |
13641 | SDValue Add1 = DAG.getNode(Opcode: Opc, DL: SL, VT, N1: Op0, N2: Op1); |
13642 | return DAG.getNode(Opcode: Opc, DL: SL, VT, N1: Add1, N2: Op2); |
13643 | } |
13644 | |
13645 | static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, |
13646 | EVT VT, |
13647 | SDValue N0, SDValue N1, SDValue N2, |
13648 | bool Signed) { |
13649 | unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32; |
13650 | SDVTList VTs = DAG.getVTList(VT1: MVT::i64, VT2: MVT::i1); |
13651 | SDValue Mad = DAG.getNode(Opcode: MadOpc, DL: SL, VTList: VTs, N1: N0, N2: N1, N3: N2); |
13652 | return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: Mad); |
13653 | } |
13654 | |
13655 | // Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high |
13656 | // multiplies, if any. |
13657 | // |
13658 | // Full 64-bit multiplies that feed into an addition are lowered here instead |
13659 | // of using the generic expansion. The generic expansion ends up with |
13660 | // a tree of ADD nodes that prevents us from using the "add" part of the |
13661 | // MAD instruction. The expansion produced here results in a chain of ADDs |
13662 | // instead of a tree. |
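// For example, (add (mul i64:x, i64:y), i64:z) where x and y are known to fit
// in 32 bits can become a single v_mad_u64_u32 producing the full 64-bit sum.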
13663 | SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N, |
13664 | DAGCombinerInfo &DCI) const { |
13665 | assert(N->getOpcode() == ISD::ADD); |
13666 | |
13667 | SelectionDAG &DAG = DCI.DAG; |
13668 | EVT VT = N->getValueType(ResNo: 0); |
13669 | SDLoc SL(N); |
13670 | SDValue LHS = N->getOperand(Num: 0); |
13671 | SDValue RHS = N->getOperand(Num: 1); |
13672 | |
13673 | if (VT.isVector()) |
13674 | return SDValue(); |
13675 | |
13676 | // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall |
13677 | // result in scalar registers for uniform values. |
13678 | if (!N->isDivergent() && Subtarget->hasSMulHi()) |
13679 | return SDValue(); |
13680 | |
13681 | unsigned NumBits = VT.getScalarSizeInBits(); |
13682 | if (NumBits <= 32 || NumBits > 64) |
13683 | return SDValue(); |
13684 | |
13685 | if (LHS.getOpcode() != ISD::MUL) { |
13686 | assert(RHS.getOpcode() == ISD::MUL); |
13687 | std::swap(a&: LHS, b&: RHS); |
13688 | } |
13689 | |
13690 | // Avoid the fold if it would unduly increase the number of multiplies due to |
13691 | // multiple uses, except on hardware with full-rate multiply-add (which is |
13692 | // part of full-rate 64-bit ops). |
13693 | if (!Subtarget->hasFullRate64Ops()) { |
13694 | unsigned NumUsers = 0; |
13695 | for (SDNode *Use : LHS->uses()) { |
13696 | // There is a use that does not feed into addition, so the multiply can't |
13697 | // be removed. We prefer MUL + ADD + ADDC over MAD + MUL. |
13698 | if (Use->getOpcode() != ISD::ADD) |
13699 | return SDValue(); |
13700 | |
13701 | // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer |
13702 | // MUL + 3xADD + 3xADDC over 3xMAD. |
13703 | ++NumUsers; |
13704 | if (NumUsers >= 3) |
13705 | return SDValue(); |
13706 | } |
13707 | } |
13708 | |
13709 | SDValue MulLHS = LHS.getOperand(i: 0); |
13710 | SDValue MulRHS = LHS.getOperand(i: 1); |
13711 | SDValue AddRHS = RHS; |
13712 | |
13713 | // Always check whether operands are small unsigned values, since that |
13714 | // knowledge is useful in more cases. Check for small signed values only if |
13715 | // doing so can unlock a shorter code sequence. |
13716 | bool MulLHSUnsigned32 = numBitsUnsigned(Op: MulLHS, DAG) <= 32; |
13717 | bool MulRHSUnsigned32 = numBitsUnsigned(Op: MulRHS, DAG) <= 32; |
13718 | |
13719 | bool MulSignedLo = false; |
13720 | if (!MulLHSUnsigned32 || !MulRHSUnsigned32) { |
13721 | MulSignedLo = numBitsSigned(Op: MulLHS, DAG) <= 32 && |
13722 | numBitsSigned(Op: MulRHS, DAG) <= 32; |
13723 | } |
13724 | |
13725 | // The operands and final result all have the same number of bits. If |
13726 | // operands need to be extended, they can be extended with garbage. The |
13727 | // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is |
13728 | // truncated away in the end. |
13729 | if (VT != MVT::i64) { |
13730 | MulLHS = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i64, Operand: MulLHS); |
13731 | MulRHS = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i64, Operand: MulRHS); |
13732 | AddRHS = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i64, Operand: AddRHS); |
13733 | } |
13734 | |
13735 | // The basic code generated is conceptually straightforward. Pseudo code: |
13736 | // |
13737 | // accum = mad_64_32 lhs.lo, rhs.lo, accum |
13738 | // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi |
13739 | // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi |
13740 | // |
13741 | // The second and third lines are optional, depending on whether the factors |
13742 | // are {sign,zero}-extended or not. |
13743 | // |
13744 | // The actual DAG is noisier than the pseudo code, but only due to |
13745 | // instructions that disassemble values into low and high parts, and |
13746 | // assemble the final result. |
13747 | SDValue One = DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32); |
13748 | |
13749 | auto MulLHSLo = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: MulLHS); |
13750 | auto MulRHSLo = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: MulRHS); |
13751 | SDValue Accum = |
13752 | getMad64_32(DAG, SL, VT: MVT::i64, N0: MulLHSLo, N1: MulRHSLo, N2: AddRHS, Signed: MulSignedLo); |
13753 | |
13754 | if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) { |
13755 | SDValue AccumLo, AccumHi; |
13756 | std::tie(args&: AccumLo, args&: AccumHi) = DAG.SplitScalar(N: Accum, DL: SL, LoVT: MVT::i32, HiVT: MVT::i32); |
13757 | |
13758 | if (!MulLHSUnsigned32) { |
13759 | auto MulLHSHi = |
13760 | DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL: SL, VT: MVT::i32, N1: MulLHS, N2: One); |
13761 | SDValue MulHi = DAG.getNode(Opcode: ISD::MUL, DL: SL, VT: MVT::i32, N1: MulLHSHi, N2: MulRHSLo); |
13762 | AccumHi = DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32, N1: MulHi, N2: AccumHi); |
13763 | } |
13764 | |
13765 | if (!MulRHSUnsigned32) { |
13766 | auto MulRHSHi = |
13767 | DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL: SL, VT: MVT::i32, N1: MulRHS, N2: One); |
13768 | SDValue MulHi = DAG.getNode(Opcode: ISD::MUL, DL: SL, VT: MVT::i32, N1: MulLHSLo, N2: MulRHSHi); |
13769 | AccumHi = DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32, N1: MulHi, N2: AccumHi); |
13770 | } |
13771 | |
13772 | Accum = DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {AccumLo, AccumHi}); |
13773 | Accum = DAG.getBitcast(VT: MVT::i64, V: Accum); |
13774 | } |
13775 | |
13776 | if (VT != MVT::i64) |
13777 | Accum = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: Accum); |
13778 | return Accum; |
13779 | } |
13780 | |
// Collect the ultimate src of each of the mul node's operands, and confirm
// each operand is effectively only 8 bits wide (byte 0 has a provider and
// byte 1 is known zero).
13783 | static std::optional<ByteProvider<SDValue>> |
13784 | handleMulOperand(const SDValue &MulOperand) { |
13785 | auto Byte0 = calculateByteProvider(Op: MulOperand, Index: 0, Depth: 0); |
13786 | if (!Byte0 || Byte0->isConstantZero()) { |
13787 | return std::nullopt; |
13788 | } |
13789 | auto Byte1 = calculateByteProvider(Op: MulOperand, Index: 1, Depth: 0); |
13790 | if (Byte1 && !Byte1->isConstantZero()) { |
13791 | return std::nullopt; |
13792 | } |
13793 | return Byte0; |
13794 | } |
13795 | |
13796 | static unsigned addPermMasks(unsigned First, unsigned Second) { |
13797 | unsigned FirstCs = First & 0x0c0c0c0c; |
13798 | unsigned SecondCs = Second & 0x0c0c0c0c; |
13799 | unsigned FirstNoCs = First & ~0x0c0c0c0c; |
13800 | unsigned SecondNoCs = Second & ~0x0c0c0c0c; |
13801 | |
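// A selector byte of 0x0c denotes a constant-zero lane of the v_perm mask
// (the "Cs" above); any other value selects a real source byte, and the
// asserts below check that at most one of the two masks selects a real byte
// in each lane. Illustrative example (values not taken from a particular
// caller): addPermMasks(0x0c0c0c00, 0x0c0c010c) == 0x0c0c0100.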
13802 | assert((FirstCs & 0xFF) | (SecondCs & 0xFF)); |
13803 | assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00)); |
13804 | assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000)); |
13805 | assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000)); |
13806 | |
13807 | return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs); |
13808 | } |
13809 | |
13810 | struct DotSrc { |
13811 | SDValue SrcOp; |
13812 | int64_t PermMask; |
13813 | int64_t DWordOffset; |
13814 | }; |
13815 | |
13816 | static void placeSources(ByteProvider<SDValue> &Src0, |
13817 | ByteProvider<SDValue> &Src1, |
13818 | SmallVectorImpl<DotSrc> &Src0s, |
13819 | SmallVectorImpl<DotSrc> &Src1s, int Step) { |
13820 | |
13821 | assert(Src0.Src.has_value() && Src1.Src.has_value()); |
13822 | // Src0s and Src1s are empty, just place arbitrarily. |
13823 | if (Step == 0) { |
13824 | Src0s.push_back(Elt: {.SrcOp: *Src0.Src, .PermMask: ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c, |
13825 | .DWordOffset: Src0.SrcOffset / 4}); |
13826 | Src1s.push_back(Elt: {.SrcOp: *Src1.Src, .PermMask: ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c, |
13827 | .DWordOffset: Src1.SrcOffset / 4}); |
13828 | return; |
13829 | } |
13830 | |
13831 | for (int BPI = 0; BPI < 2; BPI++) { |
13832 | std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1}; |
13833 | if (BPI == 1) { |
13834 | BPP = {Src1, Src0}; |
13835 | } |
13836 | unsigned ZeroMask = 0x0c0c0c0c; |
13837 | unsigned FMask = 0xFF << (8 * (3 - Step)); |
13838 | |
13839 | unsigned FirstMask = |
13840 | (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask); |
13841 | unsigned SecondMask = |
13842 | (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask); |
// Attempt to find the Src vector which contains our SDValue; if found, add
// our perm mask to the existing one. If we are unable to find a match for
// the first SDValue, attempt to find a match for the second.
13846 | int FirstGroup = -1; |
13847 | for (int I = 0; I < 2; I++) { |
13848 | SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s; |
13849 | auto MatchesFirst = [&BPP](DotSrc &IterElt) { |
13850 | return IterElt.SrcOp == *BPP.first.Src && |
13851 | (IterElt.DWordOffset == (BPP.first.SrcOffset / 4)); |
13852 | }; |
13853 | |
13854 | auto Match = llvm::find_if(Range&: Srcs, P: MatchesFirst); |
13855 | if (Match != Srcs.end()) { |
13856 | Match->PermMask = addPermMasks(First: FirstMask, Second: Match->PermMask); |
13857 | FirstGroup = I; |
13858 | break; |
13859 | } |
13860 | } |
13861 | if (FirstGroup != -1) { |
13862 | SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s; |
13863 | auto MatchesSecond = [&BPP](DotSrc &IterElt) { |
13864 | return IterElt.SrcOp == *BPP.second.Src && |
13865 | (IterElt.DWordOffset == (BPP.second.SrcOffset / 4)); |
13866 | }; |
13867 | auto Match = llvm::find_if(Range&: Srcs, P: MatchesSecond); |
13868 | if (Match != Srcs.end()) { |
13869 | Match->PermMask = addPermMasks(First: SecondMask, Second: Match->PermMask); |
13870 | } else |
13871 | Srcs.push_back(Elt: {.SrcOp: *BPP.second.Src, .PermMask: SecondMask, .DWordOffset: BPP.second.SrcOffset / 4}); |
13872 | return; |
13873 | } |
13874 | } |
13875 | |
13876 | // If we have made it here, then we could not find a match in Src0s or Src1s |
13877 | // for either Src0 or Src1, so just place them arbitrarily. |
13878 | |
13879 | unsigned ZeroMask = 0x0c0c0c0c; |
13880 | unsigned FMask = 0xFF << (8 * (3 - Step)); |
13881 | |
13882 | Src0s.push_back( |
13883 | Elt: {.SrcOp: *Src0.Src, |
13884 | .PermMask: ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)), |
.DWordOffset: Src0.SrcOffset / 4});
13886 | Src1s.push_back( |
13887 | Elt: {.SrcOp: *Src1.Src, |
13888 | .PermMask: ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)), |
13889 | .DWordOffset: Src1.SrcOffset / 4}); |
13890 | |
13891 | return; |
13892 | } |
13893 | |
13894 | static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, |
13895 | SmallVectorImpl<DotSrc> &Srcs, bool IsSigned, |
13896 | bool IsAny) { |
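// Materialize one 32-bit dot4 operand from the collected DotSrc entries:
// each source dword is fed through a PERM with its accumulated byte-select
// mask and the partial results are ORed together. A single entry whose mask
// is the identity (0x3020100) is returned unchanged.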
13897 | |
// If we have only one source, just permute it accordingly.
13899 | if (Srcs.size() == 1) { |
13900 | auto Elt = Srcs.begin(); |
13901 | auto EltOp = getDWordFromOffset(DAG, SL, Src: Elt->SrcOp, DWordOffset: Elt->DWordOffset); |
13902 | |
13903 | // v_perm will produce the original value |
13904 | if (Elt->PermMask == 0x3020100) |
13905 | return EltOp; |
13906 | |
13907 | return DAG.getNode(Opcode: AMDGPUISD::PERM, DL: SL, VT: MVT::i32, N1: EltOp, N2: EltOp, |
13908 | N3: DAG.getConstant(Val: Elt->PermMask, DL: SL, VT: MVT::i32)); |
13909 | } |
13910 | |
13911 | auto FirstElt = Srcs.begin(); |
13912 | auto SecondElt = std::next(x: FirstElt); |
13913 | |
13914 | SmallVector<SDValue, 2> Perms; |
13915 | |
// If we have multiple sources in the chain, combine them via perms (using
// the calculated perm masks) and ORs.
13918 | while (true) { |
13919 | auto FirstMask = FirstElt->PermMask; |
13920 | auto SecondMask = SecondElt->PermMask; |
13921 | |
13922 | unsigned FirstCs = FirstMask & 0x0c0c0c0c; |
13923 | unsigned FirstPlusFour = FirstMask | 0x04040404; |
// 0x0c + 0x04 = 0x10, so ANDing with 0x0F will produce 0x00 for any
// original 0x0C.
13926 | FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs; |
13927 | |
13928 | auto PermMask = addPermMasks(First: FirstMask, Second: SecondMask); |
13929 | auto FirstVal = |
13930 | getDWordFromOffset(DAG, SL, Src: FirstElt->SrcOp, DWordOffset: FirstElt->DWordOffset); |
13931 | auto SecondVal = |
13932 | getDWordFromOffset(DAG, SL, Src: SecondElt->SrcOp, DWordOffset: SecondElt->DWordOffset); |
13933 | |
13934 | Perms.push_back(Elt: DAG.getNode(Opcode: AMDGPUISD::PERM, DL: SL, VT: MVT::i32, N1: FirstVal, |
13935 | N2: SecondVal, |
13936 | N3: DAG.getConstant(Val: PermMask, DL: SL, VT: MVT::i32))); |
13937 | |
13938 | FirstElt = std::next(x: SecondElt); |
13939 | if (FirstElt == Srcs.end()) |
13940 | break; |
13941 | |
13942 | SecondElt = std::next(x: FirstElt); |
13943 | // If we only have a FirstElt, then just combine that into the cumulative |
13944 | // source node. |
13945 | if (SecondElt == Srcs.end()) { |
13946 | auto EltOp = |
13947 | getDWordFromOffset(DAG, SL, Src: FirstElt->SrcOp, DWordOffset: FirstElt->DWordOffset); |
13948 | |
13949 | Perms.push_back( |
13950 | Elt: DAG.getNode(Opcode: AMDGPUISD::PERM, DL: SL, VT: MVT::i32, N1: EltOp, N2: EltOp, |
13951 | N3: DAG.getConstant(Val: FirstElt->PermMask, DL: SL, VT: MVT::i32))); |
13952 | break; |
13953 | } |
13954 | } |
13955 | |
13956 | assert(Perms.size() == 1 || Perms.size() == 2); |
13957 | return Perms.size() == 2 |
13958 | ? DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: Perms[0], N2: Perms[1]) |
13959 | : Perms[0]; |
13960 | } |
13961 | |
13962 | static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) { |
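// The masks in Srcs were built assuming a chain of four byte multiplies.
// For shorter chains, shift the selectors down into the low mask bytes and
// mark the unused high lanes with the 0x0c (constant zero) selector, e.g.
// for ChainLength == 2 a mask of 0x01020c0c becomes 0x0c0c0102.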
13963 | for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) { |
13964 | EntryMask = EntryMask >> ((4 - ChainLength) * 8); |
13965 | auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000; |
13966 | EntryMask += ZeroMask; |
13967 | } |
13968 | } |
13969 | |
13970 | static bool isMul(const SDValue Op) { |
13971 | auto Opcode = Op.getOpcode(); |
13972 | |
13973 | return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 || |
13974 | Opcode == AMDGPUISD::MUL_I24); |
13975 | } |
13976 | |
13977 | static std::optional<bool> |
13978 | checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0, |
13979 | ByteProvider<SDValue> &Src1, const SDValue &S0Op, |
13980 | const SDValue &S1Op, const SelectionDAG &DAG) { |
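// Decide whether the dot4 that replaces this multiply chain should use
// signed or unsigned semantics: returns true for signed, false for unsigned,
// and std::nullopt if a compatible interpretation of the two operands cannot
// be established.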
// If both ops are i8s (pre legalize-dag), then the signedness semantics
// of the dot4 are irrelevant.
13983 | if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8) |
13984 | return false; |
13985 | |
13986 | auto Known0 = DAG.computeKnownBits(Op: S0Op, Depth: 0); |
13987 | bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0; |
13988 | bool S0IsSigned = Known0.countMinLeadingOnes() > 0; |
13989 | auto Known1 = DAG.computeKnownBits(Op: S1Op, Depth: 0); |
13990 | bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0; |
13991 | bool S1IsSigned = Known1.countMinLeadingOnes() > 0; |
13992 | |
13993 | assert(!(S0IsUnsigned && S0IsSigned)); |
13994 | assert(!(S1IsUnsigned && S1IsSigned)); |
13995 | |
13996 | // There are 9 possible permutations of |
13997 | // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned} |
13998 | |
13999 | // In two permutations, the sign bits are known to be the same for both Ops, |
14000 | // so simply return Signed / Unsigned corresponding to the MSB |
14001 | |
14002 | if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned)) |
14003 | return S0IsSigned; |
14004 | |
14005 | // In another two permutations, the sign bits are known to be opposite. In |
14006 | // this case return std::nullopt to indicate a bad match. |
14007 | |
14008 | if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned)) |
14009 | return std::nullopt; |
14010 | |
// In the remaining five permutations, we don't know the value of the sign
// bit for at least one Op. Since we have a valid ByteProvider, we know that
// the upper bits must be extension bits. Thus, the only ways for the sign
// bit to be unknown are if it was sign extended from an unknown value or if
// it was any extended. In either case, it is correct to use the signed
// semantics of dot4.
14017 | |
// In two of these permutations, we know the sign bit is set for one op, and
// the other is unknown. It is okay to use the signed version of dot4.
14021 | if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) || |
14022 | ((S1IsSigned && !(S0IsSigned || S0IsUnsigned)))) |
14023 | return true; |
14024 | |
// In one such permutation, we don't know either of the sign bits. It is okay
// to use the signed version of dot4.
14027 | if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned))) |
14028 | return true; |
14029 | |
// In two of these permutations, we know the sign bit is unset for one op,
// and the other is unknown. Return std::nullopt to indicate a bad match.
14033 | if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) || |
14034 | ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned)))) |
14035 | return std::nullopt; |
14036 | |
14037 | llvm_unreachable("Fully covered condition" ); |
14038 | } |
14039 | |
14040 | SDValue SITargetLowering::performAddCombine(SDNode *N, |
14041 | DAGCombinerInfo &DCI) const { |
14042 | SelectionDAG &DAG = DCI.DAG; |
14043 | EVT VT = N->getValueType(ResNo: 0); |
14044 | SDLoc SL(N); |
14045 | SDValue LHS = N->getOperand(Num: 0); |
14046 | SDValue RHS = N->getOperand(Num: 1); |
14047 | |
14048 | if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) { |
14049 | if (Subtarget->hasMad64_32()) { |
14050 | if (SDValue Folded = tryFoldToMad64_32(N, DCI)) |
14051 | return Folded; |
14052 | } |
14053 | } |
14054 | |
14055 | if (SDValue V = reassociateScalarOps(N, DAG)) { |
14056 | return V; |
14057 | } |
14058 | |
14059 | if ((isMul(Op: LHS) || isMul(Op: RHS)) && Subtarget->hasDot7Insts() && |
14060 | (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) { |
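// Attempt to fold a chain of up to four adds of byte-wide multiplies into a
// single sdot4/udot4 intrinsic: the a and b bytes are each gathered into one
// dword via v_perm and the remaining addend becomes the accumulator operand,
// conceptually rewriting acc + a0*b0 + a1*b1 + a2*b2 + a3*b3 as
// dot4(permuted a, permuted b, acc).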
14061 | SDValue TempNode(N, 0); |
14062 | std::optional<bool> IsSigned; |
14063 | SmallVector<DotSrc, 4> Src0s; |
14064 | SmallVector<DotSrc, 4> Src1s; |
14065 | SmallVector<SDValue, 4> Src2s; |
14066 | |
14067 | // Match the v_dot4 tree, while collecting src nodes. |
14068 | int ChainLength = 0; |
14069 | for (int I = 0; I < 4; I++) { |
14070 | auto MulIdx = isMul(Op: LHS) ? 0 : isMul(Op: RHS) ? 1 : -1; |
14071 | if (MulIdx == -1) |
14072 | break; |
14073 | auto Src0 = handleMulOperand(MulOperand: TempNode->getOperand(Num: MulIdx)->getOperand(Num: 0)); |
14074 | if (!Src0) |
14075 | break; |
14076 | auto Src1 = handleMulOperand(MulOperand: TempNode->getOperand(Num: MulIdx)->getOperand(Num: 1)); |
14077 | if (!Src1) |
14078 | break; |
14079 | |
14080 | auto IterIsSigned = checkDot4MulSignedness( |
14081 | N: TempNode->getOperand(Num: MulIdx), Src0&: *Src0, Src1&: *Src1, |
14082 | S0Op: TempNode->getOperand(Num: MulIdx)->getOperand(Num: 0), |
14083 | S1Op: TempNode->getOperand(Num: MulIdx)->getOperand(Num: 1), DAG); |
14084 | if (!IterIsSigned) |
14085 | break; |
14086 | if (!IsSigned) |
14087 | IsSigned = *IterIsSigned; |
14088 | if (*IterIsSigned != *IsSigned) |
14089 | break; |
14090 | placeSources(Src0&: *Src0, Src1&: *Src1, Src0s, Src1s, Step: I); |
14091 | auto AddIdx = 1 - MulIdx; |
// Allow the special case where add (add (mul24, 0), mul24) has been folded
// into add (mul24, mul24).
14094 | if (I == 2 && isMul(Op: TempNode->getOperand(Num: AddIdx))) { |
14095 | Src2s.push_back(Elt: TempNode->getOperand(Num: AddIdx)); |
14096 | auto Src0 = |
14097 | handleMulOperand(MulOperand: TempNode->getOperand(Num: AddIdx)->getOperand(Num: 0)); |
14098 | if (!Src0) |
14099 | break; |
14100 | auto Src1 = |
14101 | handleMulOperand(MulOperand: TempNode->getOperand(Num: AddIdx)->getOperand(Num: 1)); |
14102 | if (!Src1) |
14103 | break; |
14104 | auto IterIsSigned = checkDot4MulSignedness( |
14105 | N: TempNode->getOperand(Num: AddIdx), Src0&: *Src0, Src1&: *Src1, |
14106 | S0Op: TempNode->getOperand(Num: AddIdx)->getOperand(Num: 0), |
14107 | S1Op: TempNode->getOperand(Num: AddIdx)->getOperand(Num: 1), DAG); |
14108 | if (!IterIsSigned) |
14109 | break; |
14110 | assert(IsSigned); |
14111 | if (*IterIsSigned != *IsSigned) |
14112 | break; |
14113 | placeSources(Src0&: *Src0, Src1&: *Src1, Src0s, Src1s, Step: I + 1); |
14114 | Src2s.push_back(Elt: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32)); |
14115 | ChainLength = I + 2; |
14116 | break; |
14117 | } |
14118 | |
14119 | TempNode = TempNode->getOperand(Num: AddIdx); |
14120 | Src2s.push_back(Elt: TempNode); |
14121 | ChainLength = I + 1; |
14122 | if (TempNode->getNumOperands() < 2) |
14123 | break; |
14124 | LHS = TempNode->getOperand(Num: 0); |
14125 | RHS = TempNode->getOperand(Num: 1); |
14126 | } |
14127 | |
14128 | if (ChainLength < 2) |
14129 | return SDValue(); |
14130 | |
// Masks were constructed with the assumption that we would find a chain of
// length 4. If not, then we need to zero out the unused high bytes (via a
// perm selector of 0x0c) so they do not affect the dot calculation.
14134 | if (ChainLength < 4) { |
14135 | fixMasks(Srcs&: Src0s, ChainLength); |
14136 | fixMasks(Srcs&: Src1s, ChainLength); |
14137 | } |
14138 | |
14139 | SDValue Src0, Src1; |
14140 | |
14141 | // If we are just using a single source for both, and have permuted the |
14142 | // bytes consistently, we can just use the sources without permuting |
14143 | // (commutation). |
14144 | bool UseOriginalSrc = false; |
14145 | if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 && |
14146 | Src0s.begin()->PermMask == Src1s.begin()->PermMask && |
14147 | Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 && |
14148 | Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) { |
14149 | SmallVector<unsigned, 4> SrcBytes; |
14150 | auto Src0Mask = Src0s.begin()->PermMask; |
14151 | SrcBytes.push_back(Elt: Src0Mask & 0xFF000000); |
14152 | bool UniqueEntries = true; |
14153 | for (auto I = 1; I < 4; I++) { |
14154 | auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8)); |
14155 | |
14156 | if (is_contained(Range&: SrcBytes, Element: NextByte)) { |
14157 | UniqueEntries = false; |
14158 | break; |
14159 | } |
14160 | SrcBytes.push_back(Elt: NextByte); |
14161 | } |
14162 | |
14163 | if (UniqueEntries) { |
14164 | UseOriginalSrc = true; |
14165 | |
14166 | auto FirstElt = Src0s.begin(); |
14167 | auto FirstEltOp = |
14168 | getDWordFromOffset(DAG, SL, Src: FirstElt->SrcOp, DWordOffset: FirstElt->DWordOffset); |
14169 | |
14170 | auto SecondElt = Src1s.begin(); |
14171 | auto SecondEltOp = getDWordFromOffset(DAG, SL, Src: SecondElt->SrcOp, |
14172 | DWordOffset: SecondElt->DWordOffset); |
14173 | |
14174 | Src0 = DAG.getBitcastedAnyExtOrTrunc(Op: FirstEltOp, DL: SL, |
14175 | VT: MVT::getIntegerVT(BitWidth: 32)); |
14176 | Src1 = DAG.getBitcastedAnyExtOrTrunc(Op: SecondEltOp, DL: SL, |
14177 | VT: MVT::getIntegerVT(BitWidth: 32)); |
14178 | } |
14179 | } |
14180 | |
14181 | if (!UseOriginalSrc) { |
14182 | Src0 = resolveSources(DAG, SL, Srcs&: Src0s, IsSigned: false, IsAny: true); |
14183 | Src1 = resolveSources(DAG, SL, Srcs&: Src1s, IsSigned: false, IsAny: true); |
14184 | } |
14185 | |
14186 | assert(IsSigned); |
14187 | SDValue Src2 = |
14188 | DAG.getExtOrTrunc(IsSigned: *IsSigned, Op: Src2s[ChainLength - 1], DL: SL, VT: MVT::i32); |
14189 | |
14190 | SDValue IID = DAG.getTargetConstant(Val: *IsSigned ? Intrinsic::amdgcn_sdot4 |
14191 | : Intrinsic::amdgcn_udot4, |
14192 | DL: SL, VT: MVT::i64); |
14193 | |
14194 | assert(!VT.isVector()); |
14195 | auto Dot = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32, N1: IID, N2: Src0, |
14196 | N3: Src1, N4: Src2, N5: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i1)); |
14197 | |
14198 | return DAG.getExtOrTrunc(IsSigned: *IsSigned, Op: Dot, DL: SL, VT); |
14199 | } |
14200 | |
14201 | if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG()) |
14202 | return SDValue(); |
14203 | |
14204 | // add x, zext (setcc) => uaddo_carry x, 0, setcc |
14205 | // add x, sext (setcc) => usubo_carry x, 0, setcc |
14206 | unsigned Opc = LHS.getOpcode(); |
14207 | if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND || |
14208 | Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY) |
14209 | std::swap(a&: RHS, b&: LHS); |
14210 | |
14211 | Opc = RHS.getOpcode(); |
14212 | switch (Opc) { |
14213 | default: break; |
14214 | case ISD::ZERO_EXTEND: |
14215 | case ISD::SIGN_EXTEND: |
14216 | case ISD::ANY_EXTEND: { |
14217 | auto Cond = RHS.getOperand(i: 0); |
14218 | // If this won't be a real VOPC output, we would still need to insert an |
14219 | // extra instruction anyway. |
14220 | if (!isBoolSGPR(V: Cond)) |
14221 | break; |
14222 | SDVTList VTList = DAG.getVTList(VT1: MVT::i32, VT2: MVT::i1); |
14223 | SDValue Args[] = { LHS, DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32), Cond }; |
14224 | Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY; |
14225 | return DAG.getNode(Opcode: Opc, DL: SL, VTList, Ops: Args); |
14226 | } |
14227 | case ISD::UADDO_CARRY: { |
14228 | // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc |
14229 | if (!isNullConstant(V: RHS.getOperand(i: 1))) |
14230 | break; |
14231 | SDValue Args[] = { LHS, RHS.getOperand(i: 0), RHS.getOperand(i: 2) }; |
14232 | return DAG.getNode(Opcode: ISD::UADDO_CARRY, DL: SDLoc(N), VTList: RHS->getVTList(), Ops: Args); |
14233 | } |
14234 | } |
14235 | return SDValue(); |
14236 | } |
14237 | |
14238 | SDValue SITargetLowering::performSubCombine(SDNode *N, |
14239 | DAGCombinerInfo &DCI) const { |
14240 | SelectionDAG &DAG = DCI.DAG; |
14241 | EVT VT = N->getValueType(ResNo: 0); |
14242 | |
14243 | if (VT != MVT::i32) |
14244 | return SDValue(); |
14245 | |
14246 | SDLoc SL(N); |
14247 | SDValue LHS = N->getOperand(Num: 0); |
14248 | SDValue RHS = N->getOperand(Num: 1); |
14249 | |
14250 | // sub x, zext (setcc) => usubo_carry x, 0, setcc |
14251 | // sub x, sext (setcc) => uaddo_carry x, 0, setcc |
14252 | unsigned Opc = RHS.getOpcode(); |
14253 | switch (Opc) { |
14254 | default: break; |
14255 | case ISD::ZERO_EXTEND: |
14256 | case ISD::SIGN_EXTEND: |
14257 | case ISD::ANY_EXTEND: { |
14258 | auto Cond = RHS.getOperand(i: 0); |
14259 | // If this won't be a real VOPC output, we would still need to insert an |
14260 | // extra instruction anyway. |
14261 | if (!isBoolSGPR(V: Cond)) |
14262 | break; |
14263 | SDVTList VTList = DAG.getVTList(VT1: MVT::i32, VT2: MVT::i1); |
14264 | SDValue Args[] = { LHS, DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32), Cond }; |
14265 | Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY; |
14266 | return DAG.getNode(Opcode: Opc, DL: SL, VTList, Ops: Args); |
14267 | } |
14268 | } |
14269 | |
14270 | if (LHS.getOpcode() == ISD::USUBO_CARRY) { |
14271 | // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc |
14272 | if (!isNullConstant(V: LHS.getOperand(i: 1))) |
14273 | return SDValue(); |
14274 | SDValue Args[] = { LHS.getOperand(i: 0), RHS, LHS.getOperand(i: 2) }; |
14275 | return DAG.getNode(Opcode: ISD::USUBO_CARRY, DL: SDLoc(N), VTList: LHS->getVTList(), Ops: Args); |
14276 | } |
14277 | return SDValue(); |
14278 | } |
14279 | |
14280 | SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N, |
14281 | DAGCombinerInfo &DCI) const { |
14282 | |
14283 | if (N->getValueType(ResNo: 0) != MVT::i32) |
14284 | return SDValue(); |
14285 | |
14286 | if (!isNullConstant(V: N->getOperand(Num: 1))) |
14287 | return SDValue(); |
14288 | |
14289 | SelectionDAG &DAG = DCI.DAG; |
14290 | SDValue LHS = N->getOperand(Num: 0); |
14291 | |
14292 | // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc |
14293 | // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc |
14294 | unsigned LHSOpc = LHS.getOpcode(); |
14295 | unsigned Opc = N->getOpcode(); |
14296 | if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) || |
14297 | (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) { |
14298 | SDValue Args[] = { LHS.getOperand(i: 0), LHS.getOperand(i: 1), N->getOperand(Num: 2) }; |
14299 | return DAG.getNode(Opcode: Opc, DL: SDLoc(N), VTList: N->getVTList(), Ops: Args); |
14300 | } |
14301 | return SDValue(); |
14302 | } |
14303 | |
14304 | SDValue SITargetLowering::performFAddCombine(SDNode *N, |
14305 | DAGCombinerInfo &DCI) const { |
14306 | if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) |
14307 | return SDValue(); |
14308 | |
14309 | SelectionDAG &DAG = DCI.DAG; |
14310 | EVT VT = N->getValueType(ResNo: 0); |
14311 | |
14312 | SDLoc SL(N); |
14313 | SDValue LHS = N->getOperand(Num: 0); |
14314 | SDValue RHS = N->getOperand(Num: 1); |
14315 | |
14316 | // These should really be instruction patterns, but writing patterns with |
14317 | // source modifiers is a pain. |
14318 | |
14319 | // fadd (fadd (a, a), b) -> mad 2.0, a, b |
14320 | if (LHS.getOpcode() == ISD::FADD) { |
14321 | SDValue A = LHS.getOperand(i: 0); |
14322 | if (A == LHS.getOperand(i: 1)) { |
14323 | unsigned FusedOp = getFusedOpcode(DAG, N0: N, N1: LHS.getNode()); |
14324 | if (FusedOp != 0) { |
14325 | const SDValue Two = DAG.getConstantFP(Val: 2.0, DL: SL, VT); |
14326 | return DAG.getNode(Opcode: FusedOp, DL: SL, VT, N1: A, N2: Two, N3: RHS); |
14327 | } |
14328 | } |
14329 | } |
14330 | |
14331 | // fadd (b, fadd (a, a)) -> mad 2.0, a, b |
14332 | if (RHS.getOpcode() == ISD::FADD) { |
14333 | SDValue A = RHS.getOperand(i: 0); |
14334 | if (A == RHS.getOperand(i: 1)) { |
14335 | unsigned FusedOp = getFusedOpcode(DAG, N0: N, N1: RHS.getNode()); |
14336 | if (FusedOp != 0) { |
14337 | const SDValue Two = DAG.getConstantFP(Val: 2.0, DL: SL, VT); |
14338 | return DAG.getNode(Opcode: FusedOp, DL: SL, VT, N1: A, N2: Two, N3: LHS); |
14339 | } |
14340 | } |
14341 | } |
14342 | |
14343 | return SDValue(); |
14344 | } |
14345 | |
14346 | SDValue SITargetLowering::performFSubCombine(SDNode *N, |
14347 | DAGCombinerInfo &DCI) const { |
14348 | if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) |
14349 | return SDValue(); |
14350 | |
14351 | SelectionDAG &DAG = DCI.DAG; |
14352 | SDLoc SL(N); |
14353 | EVT VT = N->getValueType(ResNo: 0); |
14354 | assert(!VT.isVector()); |
14355 | |
14356 | // Try to get the fneg to fold into the source modifier. This undoes generic |
14357 | // DAG combines and folds them into the mad. |
14358 | // |
14359 | // Only do this if we are not trying to support denormals. v_mad_f32 does |
14360 | // not support denormals ever. |
14361 | SDValue LHS = N->getOperand(Num: 0); |
14362 | SDValue RHS = N->getOperand(Num: 1); |
14363 | if (LHS.getOpcode() == ISD::FADD) { |
14364 | // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c) |
14365 | SDValue A = LHS.getOperand(i: 0); |
14366 | if (A == LHS.getOperand(i: 1)) { |
14367 | unsigned FusedOp = getFusedOpcode(DAG, N0: N, N1: LHS.getNode()); |
14368 | if (FusedOp != 0){ |
14369 | const SDValue Two = DAG.getConstantFP(Val: 2.0, DL: SL, VT); |
14370 | SDValue NegRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS); |
14371 | |
14372 | return DAG.getNode(Opcode: FusedOp, DL: SL, VT, N1: A, N2: Two, N3: NegRHS); |
14373 | } |
14374 | } |
14375 | } |
14376 | |
14377 | if (RHS.getOpcode() == ISD::FADD) { |
14378 | // (fsub c, (fadd a, a)) -> mad -2.0, a, c |
14379 | |
14380 | SDValue A = RHS.getOperand(i: 0); |
14381 | if (A == RHS.getOperand(i: 1)) { |
14382 | unsigned FusedOp = getFusedOpcode(DAG, N0: N, N1: RHS.getNode()); |
14383 | if (FusedOp != 0){ |
14384 | const SDValue NegTwo = DAG.getConstantFP(Val: -2.0, DL: SL, VT); |
14385 | return DAG.getNode(Opcode: FusedOp, DL: SL, VT, N1: A, N2: NegTwo, N3: LHS); |
14386 | } |
14387 | } |
14388 | } |
14389 | |
14390 | return SDValue(); |
14391 | } |
14392 | |
14393 | SDValue SITargetLowering::performFDivCombine(SDNode *N, |
14394 | DAGCombinerInfo &DCI) const { |
14395 | SelectionDAG &DAG = DCI.DAG; |
14396 | SDLoc SL(N); |
14397 | EVT VT = N->getValueType(ResNo: 0); |
14398 | if (VT != MVT::f16 || !Subtarget->has16BitInsts()) |
14399 | return SDValue(); |
14400 | |
14401 | SDValue LHS = N->getOperand(Num: 0); |
14402 | SDValue RHS = N->getOperand(Num: 1); |
14403 | |
14404 | SDNodeFlags Flags = N->getFlags(); |
14405 | SDNodeFlags RHSFlags = RHS->getFlags(); |
14406 | if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() || |
14407 | !RHS->hasOneUse()) |
14408 | return SDValue(); |
14409 | |
14410 | if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(Val&: LHS)) { |
14411 | bool IsNegative = false; |
14412 | if (CLHS->isExactlyValue(V: 1.0) || |
14413 | (IsNegative = CLHS->isExactlyValue(V: -1.0))) { |
14414 | // fdiv contract 1.0, (sqrt contract x) -> rsq for f16 |
14415 | // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16 |
14416 | if (RHS.getOpcode() == ISD::FSQRT) { |
14417 | // TODO: Or in RHS flags, somehow missing from SDNodeFlags |
14418 | SDValue Rsq = |
14419 | DAG.getNode(Opcode: AMDGPUISD::RSQ, DL: SL, VT, Operand: RHS.getOperand(i: 0), Flags); |
14420 | return IsNegative ? DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Rsq, Flags) : Rsq; |
14421 | } |
14422 | } |
14423 | } |
14424 | |
14425 | return SDValue(); |
14426 | } |
14427 | |
14428 | SDValue SITargetLowering::performFMACombine(SDNode *N, |
14429 | DAGCombinerInfo &DCI) const { |
14430 | SelectionDAG &DAG = DCI.DAG; |
14431 | EVT VT = N->getValueType(ResNo: 0); |
14432 | SDLoc SL(N); |
14433 | |
14434 | if (!Subtarget->hasDot7Insts() || VT != MVT::f32) |
14435 | return SDValue(); |
14436 | |
// FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
14438 | // FDOT2((V2F16)S0, (V2F16)S1, (F32)z)) |
14439 | SDValue Op1 = N->getOperand(Num: 0); |
14440 | SDValue Op2 = N->getOperand(Num: 1); |
14441 | SDValue FMA = N->getOperand(Num: 2); |
14442 | |
14443 | if (FMA.getOpcode() != ISD::FMA || |
14444 | Op1.getOpcode() != ISD::FP_EXTEND || |
14445 | Op2.getOpcode() != ISD::FP_EXTEND) |
14446 | return SDValue(); |
14447 | |
14448 | // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero, |
14449 | // regardless of the denorm mode setting. Therefore, |
14450 | // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2. |
14451 | const TargetOptions &Options = DAG.getTarget().Options; |
14452 | if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath || |
14453 | (N->getFlags().hasAllowContract() && |
14454 | FMA->getFlags().hasAllowContract())) { |
14455 | Op1 = Op1.getOperand(i: 0); |
14456 | Op2 = Op2.getOperand(i: 0); |
14457 | if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || |
14458 | Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT) |
14459 | return SDValue(); |
14460 | |
14461 | SDValue Vec1 = Op1.getOperand(i: 0); |
14462 | SDValue Idx1 = Op1.getOperand(i: 1); |
14463 | SDValue Vec2 = Op2.getOperand(i: 0); |
14464 | |
14465 | SDValue FMAOp1 = FMA.getOperand(i: 0); |
14466 | SDValue FMAOp2 = FMA.getOperand(i: 1); |
14467 | SDValue FMAAcc = FMA.getOperand(i: 2); |
14468 | |
14469 | if (FMAOp1.getOpcode() != ISD::FP_EXTEND || |
14470 | FMAOp2.getOpcode() != ISD::FP_EXTEND) |
14471 | return SDValue(); |
14472 | |
14473 | FMAOp1 = FMAOp1.getOperand(i: 0); |
14474 | FMAOp2 = FMAOp2.getOperand(i: 0); |
14475 | if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || |
14476 | FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT) |
14477 | return SDValue(); |
14478 | |
14479 | SDValue Vec3 = FMAOp1.getOperand(i: 0); |
14480 | SDValue Vec4 = FMAOp2.getOperand(i: 0); |
14481 | SDValue Idx2 = FMAOp1.getOperand(i: 1); |
14482 | |
14483 | if (Idx1 != Op2.getOperand(i: 1) || Idx2 != FMAOp2.getOperand(i: 1) || |
14484 | // Idx1 and Idx2 cannot be the same. |
14485 | Idx1 == Idx2) |
14486 | return SDValue(); |
14487 | |
14488 | if (Vec1 == Vec2 || Vec3 == Vec4) |
14489 | return SDValue(); |
14490 | |
14491 | if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16) |
14492 | return SDValue(); |
14493 | |
14494 | if ((Vec1 == Vec3 && Vec2 == Vec4) || |
14495 | (Vec1 == Vec4 && Vec2 == Vec3)) { |
14496 | return DAG.getNode(Opcode: AMDGPUISD::FDOT2, DL: SL, VT: MVT::f32, N1: Vec1, N2: Vec2, N3: FMAAcc, |
14497 | N4: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i1)); |
14498 | } |
14499 | } |
14500 | return SDValue(); |
14501 | } |
14502 | |
14503 | SDValue SITargetLowering::performSetCCCombine(SDNode *N, |
14504 | DAGCombinerInfo &DCI) const { |
14505 | SelectionDAG &DAG = DCI.DAG; |
14506 | SDLoc SL(N); |
14507 | |
14508 | SDValue LHS = N->getOperand(Num: 0); |
14509 | SDValue RHS = N->getOperand(Num: 1); |
14510 | EVT VT = LHS.getValueType(); |
14511 | ISD::CondCode CC = cast<CondCodeSDNode>(Val: N->getOperand(Num: 2))->get(); |
14512 | |
14513 | auto CRHS = dyn_cast<ConstantSDNode>(Val&: RHS); |
14514 | if (!CRHS) { |
14515 | CRHS = dyn_cast<ConstantSDNode>(Val&: LHS); |
14516 | if (CRHS) { |
14517 | std::swap(a&: LHS, b&: RHS); |
14518 | CC = getSetCCSwappedOperands(Operation: CC); |
14519 | } |
14520 | } |
14521 | |
14522 | if (CRHS) { |
14523 | if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND && |
14524 | isBoolSGPR(V: LHS.getOperand(i: 0))) { |
14525 | // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1 |
14526 | // setcc (sext from i1 cc), -1, eq|sle|uge) => cc |
14527 | // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1 |
14528 | // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc |
14529 | if ((CRHS->isAllOnes() && |
14530 | (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) || |
14531 | (CRHS->isZero() && |
14532 | (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE))) |
14533 | return DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i1, N1: LHS.getOperand(i: 0), |
14534 | N2: DAG.getConstant(Val: -1, DL: SL, VT: MVT::i1)); |
14535 | if ((CRHS->isAllOnes() && |
14536 | (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) || |
14537 | (CRHS->isZero() && |
14538 | (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT))) |
14539 | return LHS.getOperand(i: 0); |
14540 | } |
14541 | |
14542 | const APInt &CRHSVal = CRHS->getAPIntValue(); |
14543 | if ((CC == ISD::SETEQ || CC == ISD::SETNE) && |
14544 | LHS.getOpcode() == ISD::SELECT && |
14545 | isa<ConstantSDNode>(Val: LHS.getOperand(i: 1)) && |
14546 | isa<ConstantSDNode>(Val: LHS.getOperand(i: 2)) && |
14547 | LHS.getConstantOperandVal(i: 1) != LHS.getConstantOperandVal(i: 2) && |
14548 | isBoolSGPR(V: LHS.getOperand(i: 0))) { |
14549 | // Given CT != FT: |
14550 | // setcc (select cc, CT, CF), CF, eq => xor cc, -1 |
14551 | // setcc (select cc, CT, CF), CF, ne => cc |
14552 | // setcc (select cc, CT, CF), CT, ne => xor cc, -1 |
14553 | // setcc (select cc, CT, CF), CT, eq => cc |
14554 | const APInt &CT = LHS.getConstantOperandAPInt(i: 1); |
14555 | const APInt &CF = LHS.getConstantOperandAPInt(i: 2); |
14556 | |
14557 | if ((CF == CRHSVal && CC == ISD::SETEQ) || |
14558 | (CT == CRHSVal && CC == ISD::SETNE)) |
14559 | return DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i1, N1: LHS.getOperand(i: 0), |
14560 | N2: DAG.getConstant(Val: -1, DL: SL, VT: MVT::i1)); |
14561 | if ((CF == CRHSVal && CC == ISD::SETNE) || |
14562 | (CT == CRHSVal && CC == ISD::SETEQ)) |
14563 | return LHS.getOperand(i: 0); |
14564 | } |
14565 | } |
14566 | |
14567 | if (VT != MVT::f32 && VT != MVT::f64 && |
14568 | (!Subtarget->has16BitInsts() || VT != MVT::f16)) |
14569 | return SDValue(); |
14570 | |
14571 | // Match isinf/isfinite pattern |
14572 | // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity)) |
14573 | // (fcmp one (fabs x), inf) -> (fp_class x, |
14574 | // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero) |
14575 | if ((CC == ISD::SETOEQ || CC == ISD::SETONE) && LHS.getOpcode() == ISD::FABS) { |
14576 | const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(Val&: RHS); |
14577 | if (!CRHS) |
14578 | return SDValue(); |
14579 | |
14580 | const APFloat &APF = CRHS->getValueAPF(); |
14581 | if (APF.isInfinity() && !APF.isNegative()) { |
14582 | const unsigned IsInfMask = SIInstrFlags::P_INFINITY | |
14583 | SIInstrFlags::N_INFINITY; |
14584 | const unsigned IsFiniteMask = SIInstrFlags::N_ZERO | |
14585 | SIInstrFlags::P_ZERO | |
14586 | SIInstrFlags::N_NORMAL | |
14587 | SIInstrFlags::P_NORMAL | |
14588 | SIInstrFlags::N_SUBNORMAL | |
14589 | SIInstrFlags::P_SUBNORMAL; |
14590 | unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask; |
14591 | return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL: SL, VT: MVT::i1, N1: LHS.getOperand(i: 0), |
14592 | N2: DAG.getConstant(Val: Mask, DL: SL, VT: MVT::i32)); |
14593 | } |
14594 | } |
14595 | |
14596 | return SDValue(); |
14597 | } |
14598 | |
14599 | SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N, |
14600 | DAGCombinerInfo &DCI) const { |
14601 | SelectionDAG &DAG = DCI.DAG; |
14602 | SDLoc SL(N); |
14603 | unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0; |
14604 | |
14605 | SDValue Src = N->getOperand(Num: 0); |
14606 | SDValue Shift = N->getOperand(Num: 0); |
14607 | |
14608 | // TODO: Extend type shouldn't matter (assuming legal types). |
14609 | if (Shift.getOpcode() == ISD::ZERO_EXTEND) |
14610 | Shift = Shift.getOperand(i: 0); |
14611 | |
14612 | if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) { |
14613 | // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x |
14614 | // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x |
14615 | // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x |
14616 | // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x |
14617 | // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x |
14618 | if (auto *C = dyn_cast<ConstantSDNode>(Val: Shift.getOperand(i: 1))) { |
14619 | SDValue Shifted = DAG.getZExtOrTrunc(Op: Shift.getOperand(i: 0), |
14620 | DL: SDLoc(Shift.getOperand(i: 0)), VT: MVT::i32); |
14621 | |
14622 | unsigned ShiftOffset = 8 * Offset; |
14623 | if (Shift.getOpcode() == ISD::SHL) |
14624 | ShiftOffset -= C->getZExtValue(); |
14625 | else |
14626 | ShiftOffset += C->getZExtValue(); |
14627 | |
14628 | if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) { |
14629 | return DAG.getNode(Opcode: AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, DL: SL, |
14630 | VT: MVT::f32, Operand: Shifted); |
14631 | } |
14632 | } |
14633 | } |
14634 | |
14635 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); |
14636 | APInt DemandedBits = APInt::getBitsSet(numBits: 32, loBit: 8 * Offset, hiBit: 8 * Offset + 8); |
14637 | if (TLI.SimplifyDemandedBits(Op: Src, DemandedBits, DCI)) { |
14638 | // We simplified Src. If this node is not dead, visit it again so it is |
14639 | // folded properly. |
14640 | if (N->getOpcode() != ISD::DELETED_NODE) |
14641 | DCI.AddToWorklist(N); |
14642 | return SDValue(N, 0); |
14643 | } |
14644 | |
14645 | // Handle (or x, (srl y, 8)) pattern when known bits are zero. |
14646 | if (SDValue DemandedSrc = |
14647 | TLI.SimplifyMultipleUseDemandedBits(Op: Src, DemandedBits, DAG)) |
14648 | return DAG.getNode(Opcode: N->getOpcode(), DL: SL, VT: MVT::f32, Operand: DemandedSrc); |
14649 | |
14650 | return SDValue(); |
14651 | } |
14652 | |
14653 | SDValue SITargetLowering::performClampCombine(SDNode *N, |
14654 | DAGCombinerInfo &DCI) const { |
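// Fold a clamp of a floating-point constant: values below 0.0 (and NaN when
// DX10 clamping is enabled) fold to 0.0, values above 1.0 fold to 1.0, and
// anything already within [0.0, 1.0] is returned unchanged.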
14655 | ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(Val: N->getOperand(Num: 0)); |
14656 | if (!CSrc) |
14657 | return SDValue(); |
14658 | |
14659 | const MachineFunction &MF = DCI.DAG.getMachineFunction(); |
14660 | const APFloat &F = CSrc->getValueAPF(); |
14661 | APFloat Zero = APFloat::getZero(Sem: F.getSemantics()); |
14662 | if (F < Zero || |
14663 | (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) { |
14664 | return DCI.DAG.getConstantFP(Val: Zero, DL: SDLoc(N), VT: N->getValueType(ResNo: 0)); |
14665 | } |
14666 | |
14667 | APFloat One(F.getSemantics(), "1.0" ); |
14668 | if (F > One) |
14669 | return DCI.DAG.getConstantFP(Val: One, DL: SDLoc(N), VT: N->getValueType(ResNo: 0)); |
14670 | |
14671 | return SDValue(CSrc, 0); |
14672 | } |
14673 | |
14674 | |
14675 | SDValue SITargetLowering::PerformDAGCombine(SDNode *N, |
14676 | DAGCombinerInfo &DCI) const { |
14677 | if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None) |
14678 | return SDValue(); |
14679 | switch (N->getOpcode()) { |
14680 | case ISD::ADD: |
14681 | return performAddCombine(N, DCI); |
14682 | case ISD::SUB: |
14683 | return performSubCombine(N, DCI); |
14684 | case ISD::UADDO_CARRY: |
14685 | case ISD::USUBO_CARRY: |
14686 | return performAddCarrySubCarryCombine(N, DCI); |
14687 | case ISD::FADD: |
14688 | return performFAddCombine(N, DCI); |
14689 | case ISD::FSUB: |
14690 | return performFSubCombine(N, DCI); |
14691 | case ISD::FDIV: |
14692 | return performFDivCombine(N, DCI); |
14693 | case ISD::SETCC: |
14694 | return performSetCCCombine(N, DCI); |
14695 | case ISD::FMAXNUM: |
14696 | case ISD::FMINNUM: |
14697 | case ISD::FMAXNUM_IEEE: |
14698 | case ISD::FMINNUM_IEEE: |
14699 | case ISD::FMAXIMUM: |
14700 | case ISD::FMINIMUM: |
14701 | case ISD::SMAX: |
14702 | case ISD::SMIN: |
14703 | case ISD::UMAX: |
14704 | case ISD::UMIN: |
14705 | case AMDGPUISD::FMIN_LEGACY: |
14706 | case AMDGPUISD::FMAX_LEGACY: |
14707 | return performMinMaxCombine(N, DCI); |
14708 | case ISD::FMA: |
14709 | return performFMACombine(N, DCI); |
14710 | case ISD::AND: |
14711 | return performAndCombine(N, DCI); |
14712 | case ISD::OR: |
14713 | return performOrCombine(N, DCI); |
14714 | case ISD::FSHR: { |
14715 | const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); |
14716 | if (N->getValueType(ResNo: 0) == MVT::i32 && N->isDivergent() && |
14717 | TII->pseudoToMCOpcode(Opcode: AMDGPU::V_PERM_B32_e64) != -1) { |
14718 | return matchPERM(N, DCI); |
14719 | } |
14720 | break; |
14721 | } |
14722 | case ISD::XOR: |
14723 | return performXorCombine(N, DCI); |
14724 | case ISD::ZERO_EXTEND: |
14725 | return performZeroExtendCombine(N, DCI); |
14726 | case ISD::SIGN_EXTEND_INREG: |
return performSignExtendInRegCombine(N, DCI);
14728 | case AMDGPUISD::FP_CLASS: |
14729 | return performClassCombine(N, DCI); |
14730 | case ISD::FCANONICALIZE: |
14731 | return performFCanonicalizeCombine(N, DCI); |
14732 | case AMDGPUISD::RCP: |
14733 | return performRcpCombine(N, DCI); |
14734 | case ISD::FLDEXP: |
14735 | case AMDGPUISD::FRACT: |
14736 | case AMDGPUISD::RSQ: |
14737 | case AMDGPUISD::RCP_LEGACY: |
14738 | case AMDGPUISD::RCP_IFLAG: |
14739 | case AMDGPUISD::RSQ_CLAMP: { |
14740 | // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted |
14741 | SDValue Src = N->getOperand(Num: 0); |
14742 | if (Src.isUndef()) |
14743 | return Src; |
14744 | break; |
14745 | } |
14746 | case ISD::SINT_TO_FP: |
14747 | case ISD::UINT_TO_FP: |
14748 | return performUCharToFloatCombine(N, DCI); |
14749 | case ISD::FCOPYSIGN: |
14750 | return performFCopySignCombine(N, DCI); |
14751 | case AMDGPUISD::CVT_F32_UBYTE0: |
14752 | case AMDGPUISD::CVT_F32_UBYTE1: |
14753 | case AMDGPUISD::CVT_F32_UBYTE2: |
14754 | case AMDGPUISD::CVT_F32_UBYTE3: |
14755 | return performCvtF32UByteNCombine(N, DCI); |
14756 | case AMDGPUISD::FMED3: |
14757 | return performFMed3Combine(N, DCI); |
14758 | case AMDGPUISD::CVT_PKRTZ_F16_F32: |
14759 | return performCvtPkRTZCombine(N, DCI); |
14760 | case AMDGPUISD::CLAMP: |
14761 | return performClampCombine(N, DCI); |
14762 | case ISD::SCALAR_TO_VECTOR: { |
14763 | SelectionDAG &DAG = DCI.DAG; |
14764 | EVT VT = N->getValueType(ResNo: 0); |
14765 | |
14766 | // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x)) |
14767 | if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) { |
14768 | SDLoc SL(N); |
14769 | SDValue Src = N->getOperand(Num: 0); |
14770 | EVT EltVT = Src.getValueType(); |
14771 | if (EltVT != MVT::i16) |
14772 | Src = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i16, Operand: Src); |
14773 | |
14774 | SDValue Ext = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i32, Operand: Src); |
14775 | return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Ext); |
14776 | } |
14777 | |
14778 | break; |
14779 | } |
14780 | case ISD::EXTRACT_VECTOR_ELT: |
14781 | return performExtractVectorEltCombine(N, DCI); |
14782 | case ISD::INSERT_VECTOR_ELT: |
14783 | return performInsertVectorEltCombine(N, DCI); |
14784 | case ISD::FP_ROUND: |
14785 | return performFPRoundCombine(N, DCI); |
14786 | case ISD::LOAD: { |
14787 | if (SDValue Widened = widenLoad(Ld: cast<LoadSDNode>(Val: N), DCI)) |
14788 | return Widened; |
14789 | [[fallthrough]]; |
14790 | } |
14791 | default: { |
14792 | if (!DCI.isBeforeLegalize()) { |
14793 | if (MemSDNode *MemNode = dyn_cast<MemSDNode>(Val: N)) |
14794 | return performMemSDNodeCombine(N: MemNode, DCI); |
14795 | } |
14796 | |
14797 | break; |
14798 | } |
14799 | } |
14800 | |
14801 | return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); |
14802 | } |
14803 | |
14804 | /// Helper function for adjustWritemask |
14805 | static unsigned SubIdx2Lane(unsigned Idx) { |
14806 | switch (Idx) { |
14807 | default: return ~0u; |
14808 | case AMDGPU::sub0: return 0; |
14809 | case AMDGPU::sub1: return 1; |
14810 | case AMDGPU::sub2: return 2; |
14811 | case AMDGPU::sub3: return 3; |
14812 | case AMDGPU::sub4: return 4; // Possible with TFE/LWE |
14813 | } |
14814 | } |
14815 | |
14816 | /// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions |
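/// so that only the channels whose results are actually used are requested,
/// e.g. if only the first and third channels of a four-channel result are
/// extracted, a dmask of 0xf is shrunk to 0x5.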
14817 | SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node, |
14818 | SelectionDAG &DAG) const { |
14819 | unsigned Opcode = Node->getMachineOpcode(); |
14820 | |
14821 | // Subtract 1 because the vdata output is not a MachineSDNode operand. |
14822 | int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, NamedIdx: AMDGPU::OpName::d16) - 1; |
14823 | if (D16Idx >= 0 && Node->getConstantOperandVal(Num: D16Idx)) |
14824 | return Node; // not implemented for D16 |
14825 | |
14826 | SDNode *Users[5] = { nullptr }; |
14827 | unsigned Lane = 0; |
14828 | unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, NamedIdx: AMDGPU::OpName::dmask) - 1; |
14829 | unsigned OldDmask = Node->getConstantOperandVal(Num: DmaskIdx); |
14830 | unsigned NewDmask = 0; |
14831 | unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, NamedIdx: AMDGPU::OpName::tfe) - 1; |
14832 | unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, NamedIdx: AMDGPU::OpName::lwe) - 1; |
14833 | bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(Num: TFEIdx)) || |
14834 | (int(LWEIdx) >= 0 && Node->getConstantOperandVal(Num: LWEIdx))) |
14835 | ? true |
14836 | : false; |
14837 | unsigned TFCLane = 0; |
14838 | bool HasChain = Node->getNumValues() > 1; |
14839 | |
14840 | if (OldDmask == 0) { |
// These are folded out, but on the off chance it happens, don't assert.
14842 | return Node; |
14843 | } |
14844 | |
14845 | unsigned OldBitsSet = llvm::popcount(Value: OldDmask); |
14846 | // Work out which is the TFE/LWE lane if that is enabled. |
14847 | if (UsesTFC) { |
14848 | TFCLane = OldBitsSet; |
14849 | } |
14850 | |
14851 | // Try to figure out the used register components |
14852 | for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end(); |
14853 | I != E; ++I) { |
14854 | |
14855 | // Don't look at users of the chain. |
14856 | if (I.getUse().getResNo() != 0) |
14857 | continue; |
14858 | |
14859 | // Abort if we can't understand the usage |
14860 | if (!I->isMachineOpcode() || |
14861 | I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG) |
14862 | return Node; |
14863 | |
14864 | // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used. |
14865 | // Note that subregs are packed, i.e. Lane==0 is the first bit set |
14866 | // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit |
14867 | // set, etc. |
14868 | Lane = SubIdx2Lane(Idx: I->getConstantOperandVal(Num: 1)); |
14869 | if (Lane == ~0u) |
14870 | return Node; |
14871 | |
14872 | // Check if the use is for the TFE/LWE generated result at VGPRn+1. |
14873 | if (UsesTFC && Lane == TFCLane) { |
14874 | Users[Lane] = *I; |
14875 | } else { |
14876 | // Set which texture component corresponds to the lane. |
14877 | unsigned Comp; |
14878 | for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) { |
14879 | Comp = llvm::countr_zero(Val: Dmask); |
14880 | Dmask &= ~(1 << Comp); |
14881 | } |
14882 | |
14883 | // Abort if we have more than one user per component. |
14884 | if (Users[Lane]) |
14885 | return Node; |
14886 | |
14887 | Users[Lane] = *I; |
14888 | NewDmask |= 1 << Comp; |
14889 | } |
14890 | } |
14891 | |
14892 | // Don't allow 0 dmask, as hardware assumes one channel enabled. |
14893 | bool NoChannels = !NewDmask; |
14894 | if (NoChannels) { |
14895 | if (!UsesTFC) { |
14896 | // No uses of the result and not using TFC. Then do nothing. |
14897 | return Node; |
14898 | } |
// If the original dmask has only one channel, there is nothing to do.
14900 | if (OldBitsSet == 1) |
14901 | return Node; |
14902 | // Use an arbitrary dmask - required for the instruction to work |
14903 | NewDmask = 1; |
14904 | } |
14905 | // Abort if there's no change |
14906 | if (NewDmask == OldDmask) |
14907 | return Node; |
14908 | |
14909 | unsigned BitsSet = llvm::popcount(Value: NewDmask); |
14910 | |
// Check for TFE or LWE - increase the number of channels by one to account
// for the extra return value.
// This will need adjustment for D16 if it is also handled in
// adjustWritemask (this function), but at present D16 is excluded.
14915 | unsigned NewChannels = BitsSet + UsesTFC; |
14916 | |
14917 | int NewOpcode = |
14918 | AMDGPU::getMaskedMIMGOp(Opc: Node->getMachineOpcode(), NewChannels); |
14919 | assert(NewOpcode != -1 && |
14920 | NewOpcode != static_cast<int>(Node->getMachineOpcode()) && |
14921 | "failed to find equivalent MIMG op" ); |
14922 | |
14923 | // Adjust the writemask in the node |
14924 | SmallVector<SDValue, 12> Ops; |
14925 | Ops.insert(I: Ops.end(), From: Node->op_begin(), To: Node->op_begin() + DmaskIdx); |
14926 | Ops.push_back(Elt: DAG.getTargetConstant(Val: NewDmask, DL: SDLoc(Node), VT: MVT::i32)); |
14927 | Ops.insert(I: Ops.end(), From: Node->op_begin() + DmaskIdx + 1, To: Node->op_end()); |
14928 | |
14929 | MVT SVT = Node->getValueType(ResNo: 0).getVectorElementType().getSimpleVT(); |
14930 | |
14931 | MVT ResultVT = NewChannels == 1 ? |
14932 | SVT : MVT::getVectorVT(VT: SVT, NumElements: NewChannels == 3 ? 4 : |
14933 | NewChannels == 5 ? 8 : NewChannels); |
14934 | SDVTList NewVTList = HasChain ? |
14935 | DAG.getVTList(VT1: ResultVT, VT2: MVT::Other) : DAG.getVTList(VT: ResultVT); |
14936 | |
14937 | |
14938 | MachineSDNode *NewNode = DAG.getMachineNode(Opcode: NewOpcode, dl: SDLoc(Node), |
14939 | VTs: NewVTList, Ops); |
14940 | |
14941 | if (HasChain) { |
14942 | // Update chain. |
14943 | DAG.setNodeMemRefs(N: NewNode, NewMemRefs: Node->memoperands()); |
14944 | DAG.ReplaceAllUsesOfValueWith(From: SDValue(Node, 1), To: SDValue(NewNode, 1)); |
14945 | } |
14946 | |
14947 | if (NewChannels == 1) { |
14948 | assert(Node->hasNUsesOfValue(1, 0)); |
14949 | SDNode *Copy = DAG.getMachineNode(Opcode: TargetOpcode::COPY, |
14950 | dl: SDLoc(Node), VT: Users[Lane]->getValueType(ResNo: 0), |
14951 | Op1: SDValue(NewNode, 0)); |
14952 | DAG.ReplaceAllUsesWith(From: Users[Lane], To: Copy); |
14953 | return nullptr; |
14954 | } |
14955 | |
14956 | // Update the users of the node with the new indices |
14957 | for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) { |
14958 | SDNode *User = Users[i]; |
14959 | if (!User) { |
14960 | // Handle the special case of NoChannels. We set NewDmask to 1 above, but |
14961 | // Users[0] is still nullptr because channel 0 doesn't really have a use. |
14962 | if (i || !NoChannels) |
14963 | continue; |
14964 | } else { |
14965 | SDValue Op = DAG.getTargetConstant(Val: Idx, DL: SDLoc(User), VT: MVT::i32); |
14966 | SDNode *NewUser = DAG.UpdateNodeOperands(N: User, Op1: SDValue(NewNode, 0), Op2: Op); |
14967 | if (NewUser != User) { |
14968 | DAG.ReplaceAllUsesWith(From: SDValue(User, 0), To: SDValue(NewUser, 0)); |
14969 | DAG.RemoveDeadNode(N: User); |
14970 | } |
14971 | } |
14972 | |
14973 | switch (Idx) { |
14974 | default: break; |
14975 | case AMDGPU::sub0: Idx = AMDGPU::sub1; break; |
14976 | case AMDGPU::sub1: Idx = AMDGPU::sub2; break; |
14977 | case AMDGPU::sub2: Idx = AMDGPU::sub3; break; |
14978 | case AMDGPU::sub3: Idx = AMDGPU::sub4; break; |
14979 | } |
14980 | } |
14981 | |
14982 | DAG.RemoveDeadNode(N: Node); |
14983 | return nullptr; |
14984 | } |
14985 | |
14986 | static bool isFrameIndexOp(SDValue Op) { |
14987 | if (Op.getOpcode() == ISD::AssertZext) |
14988 | Op = Op.getOperand(i: 0); |
14989 | |
14990 | return isa<FrameIndexSDNode>(Val: Op); |
14991 | } |
14992 | |
/// Legalize target-independent instructions (e.g. INSERT_SUBREG)
/// with frame index operands.
/// LLVM assumes that inputs to these instructions are registers.
14996 | SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node, |
14997 | SelectionDAG &DAG) const { |
14998 | if (Node->getOpcode() == ISD::CopyToReg) { |
14999 | RegisterSDNode *DestReg = cast<RegisterSDNode>(Val: Node->getOperand(Num: 1)); |
15000 | SDValue SrcVal = Node->getOperand(Num: 2); |
15001 | |
15002 | // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have |
15003 | // to try understanding copies to physical registers. |
15004 | if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) { |
15005 | SDLoc SL(Node); |
15006 | MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); |
15007 | SDValue VReg = DAG.getRegister( |
15008 | Reg: MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_1RegClass), VT: MVT::i1); |
15009 | |
15010 | SDNode *Glued = Node->getGluedNode(); |
15011 | SDValue ToVReg |
15012 | = DAG.getCopyToReg(Chain: Node->getOperand(Num: 0), dl: SL, Reg: VReg, N: SrcVal, |
15013 | Glue: SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0)); |
15014 | SDValue ToResultReg |
15015 | = DAG.getCopyToReg(Chain: ToVReg, dl: SL, Reg: SDValue(DestReg, 0), |
15016 | N: VReg, Glue: ToVReg.getValue(R: 1)); |
15017 | DAG.ReplaceAllUsesWith(From: Node, To: ToResultReg.getNode()); |
15018 | DAG.RemoveDeadNode(N: Node); |
15019 | return ToResultReg.getNode(); |
15020 | } |
15021 | } |
15022 | |
15023 | SmallVector<SDValue, 8> Ops; |
15024 | for (unsigned i = 0; i < Node->getNumOperands(); ++i) { |
15025 | if (!isFrameIndexOp(Op: Node->getOperand(Num: i))) { |
15026 | Ops.push_back(Elt: Node->getOperand(Num: i)); |
15027 | continue; |
15028 | } |
15029 | |
15030 | SDLoc DL(Node); |
15031 | Ops.push_back(Elt: SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, |
15032 | VT: Node->getOperand(Num: i).getValueType(), |
15033 | Op1: Node->getOperand(Num: i)), 0)); |
15034 | } |
15035 | |
15036 | return DAG.UpdateNodeOperands(N: Node, Ops); |
15037 | } |
15038 | |
15039 | /// Fold the instructions after selecting them. |
15040 | /// Returns null if users were already updated. |
15041 | SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, |
15042 | SelectionDAG &DAG) const { |
15043 | const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); |
15044 | unsigned Opcode = Node->getMachineOpcode(); |
15045 | |
15046 | if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() && |
15047 | !TII->isGather4(Opcode) && |
15048 | AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::dmask)) { |
15049 | return adjustWritemask(Node, DAG); |
15050 | } |
15051 | |
15052 | if (Opcode == AMDGPU::INSERT_SUBREG || |
15053 | Opcode == AMDGPU::REG_SEQUENCE) { |
15054 | legalizeTargetIndependentNode(Node, DAG); |
15055 | return Node; |
15056 | } |
15057 | |
15058 | switch (Opcode) { |
15059 | case AMDGPU::V_DIV_SCALE_F32_e64: |
15060 | case AMDGPU::V_DIV_SCALE_F64_e64: { |
15061 | // Satisfy the operand register constraint when one of the inputs is |
15062 | // undefined. Ordinarily each undef value will have its own implicit_def of |
15063 | // a vreg, so force these to use a single register. |
15064 | SDValue Src0 = Node->getOperand(Num: 1); |
15065 | SDValue Src1 = Node->getOperand(Num: 3); |
15066 | SDValue Src2 = Node->getOperand(Num: 5); |
15067 | |
15068 | if ((Src0.isMachineOpcode() && |
15069 | Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) && |
15070 | (Src0 == Src1 || Src0 == Src2)) |
15071 | break; |
15072 | |
15073 | MVT VT = Src0.getValueType().getSimpleVT(); |
15074 | const TargetRegisterClass *RC = |
15075 | getRegClassFor(VT, isDivergent: Src0.getNode()->isDivergent()); |
15076 | |
15077 | MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); |
15078 | SDValue UndefReg = DAG.getRegister(Reg: MRI.createVirtualRegister(RegClass: RC), VT); |
15079 | |
15080 | SDValue ImpDef = DAG.getCopyToReg(Chain: DAG.getEntryNode(), dl: SDLoc(Node), |
15081 | Reg: UndefReg, N: Src0, Glue: SDValue()); |
15082 | |
15083 | // src0 must be the same register as src1 or src2, even if the value is |
15084 | // undefined, so make sure we don't violate this constraint. |
15085 | if (Src0.isMachineOpcode() && |
15086 | Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) { |
15087 | if (Src1.isMachineOpcode() && |
15088 | Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) |
15089 | Src0 = Src1; |
15090 | else if (Src2.isMachineOpcode() && |
15091 | Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) |
15092 | Src0 = Src2; |
15093 | else { |
15094 | assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF); |
15095 | Src0 = UndefReg; |
15096 | Src1 = UndefReg; |
15097 | } |
15098 | } else |
15099 | break; |
15100 | |
15101 | SmallVector<SDValue, 9> Ops(Node->op_begin(), Node->op_end()); |
15102 | Ops[1] = Src0; |
15103 | Ops[3] = Src1; |
15104 | Ops[5] = Src2; |
15105 | Ops.push_back(Elt: ImpDef.getValue(R: 1)); |
15106 | return DAG.getMachineNode(Opcode, dl: SDLoc(Node), VTs: Node->getVTList(), Ops); |
15107 | } |
15108 | default: |
15109 | break; |
15110 | } |
15111 | |
15112 | return Node; |
15113 | } |
15114 | |
15115 | // Any MIMG instructions that use tfe or lwe require an initialization of the |
15116 | // result register that will be written in the case of a memory access failure. |
15117 | // The required code is also added to tie this init code to the result of the |
15118 | // img instruction. |
15119 | void SITargetLowering::AddMemOpInit(MachineInstr &MI) const { |
15120 | const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); |
15121 | const SIRegisterInfo &TRI = TII->getRegisterInfo(); |
15122 | MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); |
15123 | MachineBasicBlock &MBB = *MI.getParent(); |
15124 | |
15125 | int DstIdx = |
15126 | AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), NamedIdx: AMDGPU::OpName::vdata); |
15127 | unsigned InitIdx = 0; |
15128 | |
15129 | if (TII->isImage(MI)) { |
15130 | MachineOperand *TFE = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::tfe); |
15131 | MachineOperand *LWE = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::lwe); |
15132 | MachineOperand *D16 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::d16); |
15133 | |
15134 | if (!TFE && !LWE) // intersect_ray |
15135 | return; |
15136 | |
15137 | unsigned TFEVal = TFE ? TFE->getImm() : 0; |
15138 | unsigned LWEVal = LWE ? LWE->getImm() : 0; |
15139 | unsigned D16Val = D16 ? D16->getImm() : 0; |
15140 | |
15141 | if (!TFEVal && !LWEVal) |
15142 | return; |
15143 | |
// At least one of TFE or LWE is non-zero
15145 | // We have to insert a suitable initialization of the result value and |
15146 | // tie this to the dest of the image instruction. |
15147 | |
15148 | // Calculate which dword we have to initialize to 0. |
15149 | MachineOperand *MO_Dmask = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::dmask); |
15150 | |
// Check that the dmask operand is found.
15152 | assert(MO_Dmask && "Expected dmask operand in instruction" ); |
15153 | |
15154 | unsigned dmask = MO_Dmask->getImm(); |
15155 | // Determine the number of active lanes taking into account the |
15156 | // Gather4 special case |
15157 | unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(Value: dmask); |
15158 | |
15159 | bool Packed = !Subtarget->hasUnpackedD16VMem(); |
15160 | |
15161 | InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1; |
15162 | |
15163 | // Abandon attempt if the dst size isn't large enough |
15164 | // - this is in fact an error but this is picked up elsewhere and |
15165 | // reported correctly. |
15166 | uint32_t DstSize = |
15167 | TRI.getRegSizeInBits(RC: *TII->getOpRegClass(MI, OpNo: DstIdx)) / 32; |
15168 | if (DstSize < InitIdx) |
15169 | return; |
15170 | } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(Opc: MI.getOpcode())) { |
15171 | InitIdx = TRI.getRegSizeInBits(RC: *TII->getOpRegClass(MI, OpNo: DstIdx)) / 32; |
15172 | } else { |
15173 | return; |
15174 | } |
15175 | |
15176 | const DebugLoc &DL = MI.getDebugLoc(); |
15177 | |
15178 | // Create a register for the initialization value. |
15179 | Register PrevDst = MRI.cloneVirtualRegister(VReg: MI.getOperand(i: DstIdx).getReg()); |
15180 | unsigned NewDst = 0; // Final initialized value will be in here |
15181 | |
15182 | // If PRTStrictNull feature is enabled (the default) then initialize |
15183 | // all the result registers to 0, otherwise just the error indication |
15184 | // register (VGPRn+1) |
15185 | unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1; |
15186 | unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1); |
15187 | |
15188 | BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: PrevDst); |
15189 | for (; SizeLeft; SizeLeft--, CurrIdx++) { |
15190 | NewDst = MRI.createVirtualRegister(RegClass: TII->getOpRegClass(MI, OpNo: DstIdx)); |
15191 | // Initialize dword |
15192 | Register SubReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass); |
15193 | BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: SubReg) |
15194 | .addImm(Val: 0); |
15195 | // Insert into the super-reg |
15196 | BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::INSERT_SUBREG), DestReg: NewDst) |
15197 | .addReg(RegNo: PrevDst) |
15198 | .addReg(RegNo: SubReg) |
15199 | .addImm(Val: SIRegisterInfo::getSubRegFromChannel(Channel: CurrIdx)); |
15200 | |
15201 | PrevDst = NewDst; |
15202 | } |
15203 | |
15204 | // Add as an implicit operand |
15205 | MI.addOperand(Op: MachineOperand::CreateReg(Reg: NewDst, isDef: false, isImp: true)); |
15206 | |
15207 | // Tie the just added implicit operand to the dst |
15208 | MI.tieOperands(DefIdx: DstIdx, UseIdx: MI.getNumOperands() - 1); |
15209 | } |
15210 | |
15211 | /// Assign the register class depending on the number of |
15212 | /// bits set in the writemask |
15213 | void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, |
15214 | SDNode *Node) const { |
15215 | const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); |
15216 | |
15217 | MachineFunction *MF = MI.getParent()->getParent(); |
15218 | MachineRegisterInfo &MRI = MF->getRegInfo(); |
15219 | SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); |
15220 | |
15221 | if (TII->isVOP3(Opcode: MI.getOpcode())) { |
15222 | // Make sure constant bus requirements are respected. |
15223 | TII->legalizeOperandsVOP3(MRI, MI); |
15224 | |
15225 | // Prefer VGPRs over AGPRs in mAI instructions where possible. |
// This saves a chain copy of registers and better balances register use
// between VGPRs and AGPRs, as AGPR tuples tend to be big.
15228 | if (!MI.getDesc().operands().empty()) { |
15229 | unsigned Opc = MI.getOpcode(); |
15230 | bool HasAGPRs = Info->mayNeedAGPRs(); |
15231 | const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); |
15232 | int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, NamedIdx: AMDGPU::OpName::src2); |
15233 | for (auto I : |
15234 | {AMDGPU::getNamedOperandIdx(Opcode: Opc, NamedIdx: AMDGPU::OpName::src0), |
15235 | AMDGPU::getNamedOperandIdx(Opcode: Opc, NamedIdx: AMDGPU::OpName::src1), Src2Idx}) { |
15236 | if (I == -1) |
15237 | break; |
15238 | if ((I == Src2Idx) && (HasAGPRs)) |
15239 | break; |
15240 | MachineOperand &Op = MI.getOperand(i: I); |
15241 | if (!Op.isReg() || !Op.getReg().isVirtual()) |
15242 | continue; |
15243 | auto *RC = TRI->getRegClassForReg(MRI, Reg: Op.getReg()); |
15244 | if (!TRI->hasAGPRs(RC)) |
15245 | continue; |
15246 | auto *Src = MRI.getUniqueVRegDef(Reg: Op.getReg()); |
15247 | if (!Src || !Src->isCopy() || |
15248 | !TRI->isSGPRReg(MRI, Reg: Src->getOperand(i: 1).getReg())) |
15249 | continue; |
15250 | auto *NewRC = TRI->getEquivalentVGPRClass(SRC: RC); |
15251 | // All uses of agpr64 and agpr32 can also accept vgpr except for |
15252 | // v_accvgpr_read, but we do not produce agpr reads during selection, |
15253 | // so no use checks are needed. |
15254 | MRI.setRegClass(Reg: Op.getReg(), RC: NewRC); |
15255 | } |
15256 | |
15257 | if (!HasAGPRs) |
15258 | return; |
15259 | |
15260 | // Resolve the rest of AV operands to AGPRs. |
15261 | if (auto *Src2 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src2)) { |
15262 | if (Src2->isReg() && Src2->getReg().isVirtual()) { |
15263 | auto *RC = TRI->getRegClassForReg(MRI, Reg: Src2->getReg()); |
15264 | if (TRI->isVectorSuperClass(RC)) { |
15265 | auto *NewRC = TRI->getEquivalentAGPRClass(SRC: RC); |
15266 | MRI.setRegClass(Reg: Src2->getReg(), RC: NewRC); |
15267 | if (Src2->isTied()) |
15268 | MRI.setRegClass(Reg: MI.getOperand(i: 0).getReg(), RC: NewRC); |
15269 | } |
15270 | } |
15271 | } |
15272 | } |
15273 | |
15274 | return; |
15275 | } |
15276 | |
15277 | if (TII->isImage(MI)) |
15278 | TII->enforceOperandRCAlignment(MI, OpName: AMDGPU::OpName::vaddr); |
15279 | } |
15280 | |
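// Materialize a 32-bit immediate into an SGPR with S_MOV_B32 and return the
// resulting value.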
15281 | static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, |
15282 | uint64_t Val) { |
15283 | SDValue K = DAG.getTargetConstant(Val, DL, VT: MVT::i32); |
15284 | return SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32, Op1: K), 0); |
15285 | } |
15286 | |
15287 | MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG, |
15288 | const SDLoc &DL, |
15289 | SDValue Ptr) const { |
15290 | const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); |
15291 | |
15292 | // Build the half of the subregister with the constants before building the |
15293 | // full 128-bit register. If we are building multiple resource descriptors, |
15294 | // this will allow CSEing of the 2-component register. |
15295 | const SDValue Ops0[] = { |
15296 | DAG.getTargetConstant(Val: AMDGPU::SGPR_64RegClassID, DL, VT: MVT::i32), |
15297 | buildSMovImm32(DAG, DL, Val: 0), |
15298 | DAG.getTargetConstant(Val: AMDGPU::sub0, DL, VT: MVT::i32), |
15299 | buildSMovImm32(DAG, DL, Val: TII->getDefaultRsrcDataFormat() >> 32), |
15300 | DAG.getTargetConstant(Val: AMDGPU::sub1, DL, VT: MVT::i32) |
15301 | }; |
15302 | |
15303 | SDValue SubRegHi = SDValue(DAG.getMachineNode(Opcode: AMDGPU::REG_SEQUENCE, dl: DL, |
15304 | VT: MVT::v2i32, Ops: Ops0), 0); |
15305 | |
15306 | // Combine the constants and the pointer. |
15307 | const SDValue Ops1[] = { |
15308 | DAG.getTargetConstant(Val: AMDGPU::SGPR_128RegClassID, DL, VT: MVT::i32), |
15309 | Ptr, |
15310 | DAG.getTargetConstant(Val: AMDGPU::sub0_sub1, DL, VT: MVT::i32), |
15311 | SubRegHi, |
15312 | DAG.getTargetConstant(Val: AMDGPU::sub2_sub3, DL, VT: MVT::i32) |
15313 | }; |
15314 | |
15315 | return DAG.getMachineNode(Opcode: AMDGPU::REG_SEQUENCE, dl: DL, VT: MVT::v4i32, Ops: Ops1); |
15316 | } |
15317 | |
15318 | /// Return a resource descriptor with the 'Add TID' bit enabled |
15319 | /// The TID (Thread ID) is multiplied by the stride value (bits [61:48] |
15320 | /// of the resource descriptor) to create an offset, which is added to |
15321 | /// the resource pointer. |
15322 | MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL, |
15323 | SDValue Ptr, uint32_t RsrcDword1, |
15324 | uint64_t RsrcDword2And3) const { |
15325 | SDValue PtrLo = DAG.getTargetExtractSubreg(SRIdx: AMDGPU::sub0, DL, VT: MVT::i32, Operand: Ptr); |
15326 | SDValue PtrHi = DAG.getTargetExtractSubreg(SRIdx: AMDGPU::sub1, DL, VT: MVT::i32, Operand: Ptr); |
15327 | if (RsrcDword1) { |
15328 | PtrHi = SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_OR_B32, dl: DL, VT: MVT::i32, Op1: PtrHi, |
15329 | Op2: DAG.getConstant(Val: RsrcDword1, DL, VT: MVT::i32)), |
15330 | 0); |
15331 | } |
15332 | |
15333 | SDValue DataLo = buildSMovImm32(DAG, DL, |
15334 | Val: RsrcDword2And3 & UINT64_C(0xFFFFFFFF)); |
15335 | SDValue DataHi = buildSMovImm32(DAG, DL, Val: RsrcDword2And3 >> 32); |
15336 | |
15337 | const SDValue Ops[] = { |
15338 | DAG.getTargetConstant(Val: AMDGPU::SGPR_128RegClassID, DL, VT: MVT::i32), |
15339 | PtrLo, |
15340 | DAG.getTargetConstant(Val: AMDGPU::sub0, DL, VT: MVT::i32), |
15341 | PtrHi, |
15342 | DAG.getTargetConstant(Val: AMDGPU::sub1, DL, VT: MVT::i32), |
15343 | DataLo, |
15344 | DAG.getTargetConstant(Val: AMDGPU::sub2, DL, VT: MVT::i32), |
15345 | DataHi, |
15346 | DAG.getTargetConstant(Val: AMDGPU::sub3, DL, VT: MVT::i32) |
15347 | }; |
15348 | |
15349 | return DAG.getMachineNode(Opcode: AMDGPU::REG_SEQUENCE, dl: DL, VT: MVT::v4i32, Ops); |
15350 | } |
15351 | |
15352 | //===----------------------------------------------------------------------===// |
15353 | // SI Inline Assembly Support |
15354 | //===----------------------------------------------------------------------===// |
15355 | |
15356 | std::pair<unsigned, const TargetRegisterClass *> |
15357 | SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_, |
15358 | StringRef Constraint, |
15359 | MVT VT) const { |
15360 | const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_); |
15361 | |
15362 | const TargetRegisterClass *RC = nullptr; |
15363 | if (Constraint.size() == 1) { |
15364 | const unsigned BitWidth = VT.getSizeInBits(); |
15365 | switch (Constraint[0]) { |
15366 | default: |
15367 | return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); |
15368 | case 's': |
15369 | case 'r': |
15370 | switch (BitWidth) { |
15371 | case 16: |
15372 | RC = &AMDGPU::SReg_32RegClass; |
15373 | break; |
15374 | case 64: |
15375 | RC = &AMDGPU::SGPR_64RegClass; |
15376 | break; |
15377 | default: |
15378 | RC = SIRegisterInfo::getSGPRClassForBitWidth(BitWidth); |
15379 | if (!RC) |
15380 | return std::pair(0U, nullptr); |
15381 | break; |
15382 | } |
15383 | break; |
15384 | case 'v': |
15385 | switch (BitWidth) { |
15386 | case 16: |
15387 | RC = &AMDGPU::VGPR_32RegClass; |
15388 | break; |
15389 | default: |
15390 | RC = TRI->getVGPRClassForBitWidth(BitWidth); |
15391 | if (!RC) |
15392 | return std::pair(0U, nullptr); |
15393 | break; |
15394 | } |
15395 | break; |
15396 | case 'a': |
15397 | if (!Subtarget->hasMAIInsts()) |
15398 | break; |
15399 | switch (BitWidth) { |
15400 | case 16: |
15401 | RC = &AMDGPU::AGPR_32RegClass; |
15402 | break; |
15403 | default: |
15404 | RC = TRI->getAGPRClassForBitWidth(BitWidth); |
15405 | if (!RC) |
15406 | return std::pair(0U, nullptr); |
15407 | break; |
15408 | } |
15409 | break; |
15410 | } |
15411 | // We actually support i128, i16 and f16 as inline parameters |
15412 | // even if they are not reported as legal |
15413 | if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 || |
15414 | VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16)) |
15415 | return std::pair(0U, RC); |
15416 | } |
15417 | |
15418 | if (Constraint.starts_with(Prefix: "{" ) && Constraint.ends_with(Suffix: "}" )) { |
15419 | StringRef RegName(Constraint.data() + 1, Constraint.size() - 2); |
15420 | if (RegName.consume_front(Prefix: "v" )) { |
15421 | RC = &AMDGPU::VGPR_32RegClass; |
15422 | } else if (RegName.consume_front(Prefix: "s" )) { |
15423 | RC = &AMDGPU::SGPR_32RegClass; |
15424 | } else if (RegName.consume_front(Prefix: "a" )) { |
15425 | RC = &AMDGPU::AGPR_32RegClass; |
15426 | } |
15427 | |
15428 | if (RC) { |
15429 | uint32_t Idx; |
15430 | if (RegName.consume_front(Prefix: "[" )) { |
15431 | uint32_t End; |
15432 | bool Failed = RegName.consumeInteger(Radix: 10, Result&: Idx); |
15433 | Failed |= !RegName.consume_front(Prefix: ":" ); |
15434 | Failed |= RegName.consumeInteger(Radix: 10, Result&: End); |
15435 | Failed |= !RegName.consume_back(Suffix: "]" ); |
15436 | if (!Failed) { |
15437 | uint32_t Width = (End - Idx + 1) * 32; |
15438 | MCRegister Reg = RC->getRegister(i: Idx); |
15439 | if (SIRegisterInfo::isVGPRClass(RC)) |
15440 | RC = TRI->getVGPRClassForBitWidth(BitWidth: Width); |
15441 | else if (SIRegisterInfo::isSGPRClass(RC)) |
15442 | RC = TRI->getSGPRClassForBitWidth(BitWidth: Width); |
15443 | else if (SIRegisterInfo::isAGPRClass(RC)) |
15444 | RC = TRI->getAGPRClassForBitWidth(BitWidth: Width); |
15445 | if (RC) { |
15446 | Reg = TRI->getMatchingSuperReg(Reg, SubIdx: AMDGPU::sub0, RC); |
15447 | return std::pair(Reg, RC); |
15448 | } |
15449 | } |
15450 | } else { |
15451 | bool Failed = RegName.getAsInteger(Radix: 10, Result&: Idx); |
15452 | if (!Failed && Idx < RC->getNumRegs()) |
15453 | return std::pair(RC->getRegister(i: Idx), RC); |
15454 | } |
15455 | } |
15456 | } |
15457 | |
15458 | auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); |
15459 | if (Ret.first) |
15460 | Ret.second = TRI->getPhysRegBaseClass(Reg: Ret.first); |
15461 | |
15462 | return Ret; |
15463 | } |
15464 | |
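// Return true if Constraint is one of the target-specific immediate inline
// asm constraints ('I', 'J', 'A', 'B', 'C', "DA", "DB").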
15465 | static bool isImmConstraint(StringRef Constraint) { |
15466 | if (Constraint.size() == 1) { |
15467 | switch (Constraint[0]) { |
15468 | default: break; |
15469 | case 'I': |
15470 | case 'J': |
15471 | case 'A': |
15472 | case 'B': |
15473 | case 'C': |
15474 | return true; |
15475 | } |
15476 | } else if (Constraint == "DA" || |
15477 | Constraint == "DB" ) { |
15478 | return true; |
15479 | } |
15480 | return false; |
15481 | } |
15482 | |
15483 | SITargetLowering::ConstraintType |
15484 | SITargetLowering::getConstraintType(StringRef Constraint) const { |
15485 | if (Constraint.size() == 1) { |
15486 | switch (Constraint[0]) { |
15487 | default: break; |
15488 | case 's': |
15489 | case 'v': |
15490 | case 'a': |
15491 | return C_RegisterClass; |
15492 | } |
15493 | } |
15494 | if (isImmConstraint(Constraint)) { |
15495 | return C_Other; |
15496 | } |
15497 | return TargetLowering::getConstraintType(Constraint); |
15498 | } |
15499 | |
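// Unless Val is already an inlinable integer literal, mask it down to the low
// Size bits of the operand.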
15500 | static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) { |
15501 | if (!AMDGPU::isInlinableIntLiteral(Literal: Val)) { |
15502 | Val = Val & maskTrailingOnes<uint64_t>(N: Size); |
15503 | } |
15504 | return Val; |
15505 | } |
15506 | |
15507 | void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op, |
15508 | StringRef Constraint, |
15509 | std::vector<SDValue> &Ops, |
15510 | SelectionDAG &DAG) const { |
15511 | if (isImmConstraint(Constraint)) { |
15512 | uint64_t Val; |
15513 | if (getAsmOperandConstVal(Op, Val) && |
15514 | checkAsmConstraintVal(Op, Constraint, Val)) { |
15515 | Val = clearUnusedBits(Val, Size: Op.getScalarValueSizeInBits()); |
15516 | Ops.push_back(x: DAG.getTargetConstant(Val, DL: SDLoc(Op), VT: MVT::i64)); |
15517 | } |
15518 | } else { |
15519 | TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); |
15520 | } |
15521 | } |
15522 | |
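// Extract the constant value of inline asm operand Op into Val. Handles
// integer and FP constants as well as two-element 16-bit splat vectors;
// returns false for anything else.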
15523 | bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const { |
15524 | unsigned Size = Op.getScalarValueSizeInBits(); |
15525 | if (Size > 64) |
15526 | return false; |
15527 | |
15528 | if (Size == 16 && !Subtarget->has16BitInsts()) |
15529 | return false; |
15530 | |
15531 | if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: Op)) { |
15532 | Val = C->getSExtValue(); |
15533 | return true; |
15534 | } |
15535 | if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: Op)) { |
15536 | Val = C->getValueAPF().bitcastToAPInt().getSExtValue(); |
15537 | return true; |
15538 | } |
15539 | if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Val&: Op)) { |
15540 | if (Size != 16 || Op.getNumOperands() != 2) |
15541 | return false; |
15542 | if (Op.getOperand(i: 0).isUndef() || Op.getOperand(i: 1).isUndef()) |
15543 | return false; |
15544 | if (ConstantSDNode *C = V->getConstantSplatNode()) { |
15545 | Val = C->getSExtValue(); |
15546 | return true; |
15547 | } |
15548 | if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) { |
15549 | Val = C->getValueAPF().bitcastToAPInt().getSExtValue(); |
15550 | return true; |
15551 | } |
15552 | } |
15553 | |
15554 | return false; |
15555 | } |
15556 | |
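// Return true if Val satisfies the immediate inline asm constraint for Op.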
15557 | bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint, |
15558 | uint64_t Val) const { |
15559 | if (Constraint.size() == 1) { |
15560 | switch (Constraint[0]) { |
15561 | case 'I': |
15562 | return AMDGPU::isInlinableIntLiteral(Literal: Val); |
15563 | case 'J': |
15564 | return isInt<16>(x: Val); |
15565 | case 'A': |
15566 | return checkAsmConstraintValA(Op, Val); |
15567 | case 'B': |
15568 | return isInt<32>(x: Val); |
15569 | case 'C': |
15570 | return isUInt<32>(x: clearUnusedBits(Val, Size: Op.getScalarValueSizeInBits())) || |
15571 | AMDGPU::isInlinableIntLiteral(Literal: Val); |
15572 | default: |
15573 | break; |
15574 | } |
15575 | } else if (Constraint.size() == 2) { |
15576 | if (Constraint == "DA" ) { |
15577 | int64_t HiBits = static_cast<int32_t>(Val >> 32); |
15578 | int64_t LoBits = static_cast<int32_t>(Val); |
15579 | return checkAsmConstraintValA(Op, Val: HiBits, MaxSize: 32) && |
15580 | checkAsmConstraintValA(Op, Val: LoBits, MaxSize: 32); |
15581 | } |
15582 | if (Constraint == "DB" ) { |
15583 | return true; |
15584 | } |
15585 | } |
15586 | llvm_unreachable("Invalid asm constraint" ); |
15587 | } |
15588 | |
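// Check whether Val is an inlinable literal for the 'A' constraint at the
// operand's scalar size, clamped to MaxSize bits.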
15589 | bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val, |
15590 | unsigned MaxSize) const { |
15591 | unsigned Size = std::min<unsigned>(a: Op.getScalarValueSizeInBits(), b: MaxSize); |
15592 | bool HasInv2Pi = Subtarget->hasInv2PiInlineImm(); |
15593 | if (Size == 16) { |
15594 | MVT VT = Op.getSimpleValueType(); |
15595 | switch (VT.SimpleTy) { |
15596 | default: |
15597 | return false; |
15598 | case MVT::i16: |
15599 | return AMDGPU::isInlinableLiteralI16(Literal: Val, HasInv2Pi); |
15600 | case MVT::f16: |
15601 | return AMDGPU::isInlinableLiteralFP16(Literal: Val, HasInv2Pi); |
15602 | case MVT::bf16: |
15603 | return AMDGPU::isInlinableLiteralBF16(Literal: Val, HasInv2Pi); |
15604 | case MVT::v2i16: |
15605 | return AMDGPU::getInlineEncodingV2I16(Literal: Val).has_value(); |
15606 | case MVT::v2f16: |
15607 | return AMDGPU::getInlineEncodingV2F16(Literal: Val).has_value(); |
15608 | case MVT::v2bf16: |
15609 | return AMDGPU::getInlineEncodingV2BF16(Literal: Val).has_value(); |
15610 | } |
15611 | } |
15612 | if ((Size == 32 && AMDGPU::isInlinableLiteral32(Literal: Val, HasInv2Pi)) || |
15613 | (Size == 64 && AMDGPU::isInlinableLiteral64(Literal: Val, HasInv2Pi))) |
15614 | return true; |
15615 | return false; |
15616 | } |
15617 | |
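// Map an unaligned VGPR/AGPR register class ID to its even-aligned (_Align2)
// counterpart, or return -1 if there is none.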
15618 | static int getAlignedAGPRClassID(unsigned UnalignedClassID) { |
15619 | switch (UnalignedClassID) { |
15620 | case AMDGPU::VReg_64RegClassID: |
15621 | return AMDGPU::VReg_64_Align2RegClassID; |
15622 | case AMDGPU::VReg_96RegClassID: |
15623 | return AMDGPU::VReg_96_Align2RegClassID; |
15624 | case AMDGPU::VReg_128RegClassID: |
15625 | return AMDGPU::VReg_128_Align2RegClassID; |
15626 | case AMDGPU::VReg_160RegClassID: |
15627 | return AMDGPU::VReg_160_Align2RegClassID; |
15628 | case AMDGPU::VReg_192RegClassID: |
15629 | return AMDGPU::VReg_192_Align2RegClassID; |
15630 | case AMDGPU::VReg_224RegClassID: |
15631 | return AMDGPU::VReg_224_Align2RegClassID; |
15632 | case AMDGPU::VReg_256RegClassID: |
15633 | return AMDGPU::VReg_256_Align2RegClassID; |
15634 | case AMDGPU::VReg_288RegClassID: |
15635 | return AMDGPU::VReg_288_Align2RegClassID; |
15636 | case AMDGPU::VReg_320RegClassID: |
15637 | return AMDGPU::VReg_320_Align2RegClassID; |
15638 | case AMDGPU::VReg_352RegClassID: |
15639 | return AMDGPU::VReg_352_Align2RegClassID; |
15640 | case AMDGPU::VReg_384RegClassID: |
15641 | return AMDGPU::VReg_384_Align2RegClassID; |
15642 | case AMDGPU::VReg_512RegClassID: |
15643 | return AMDGPU::VReg_512_Align2RegClassID; |
15644 | case AMDGPU::VReg_1024RegClassID: |
15645 | return AMDGPU::VReg_1024_Align2RegClassID; |
15646 | case AMDGPU::AReg_64RegClassID: |
15647 | return AMDGPU::AReg_64_Align2RegClassID; |
15648 | case AMDGPU::AReg_96RegClassID: |
15649 | return AMDGPU::AReg_96_Align2RegClassID; |
15650 | case AMDGPU::AReg_128RegClassID: |
15651 | return AMDGPU::AReg_128_Align2RegClassID; |
15652 | case AMDGPU::AReg_160RegClassID: |
15653 | return AMDGPU::AReg_160_Align2RegClassID; |
15654 | case AMDGPU::AReg_192RegClassID: |
15655 | return AMDGPU::AReg_192_Align2RegClassID; |
15656 | case AMDGPU::AReg_256RegClassID: |
15657 | return AMDGPU::AReg_256_Align2RegClassID; |
15658 | case AMDGPU::AReg_512RegClassID: |
15659 | return AMDGPU::AReg_512_Align2RegClassID; |
15660 | case AMDGPU::AReg_1024RegClassID: |
15661 | return AMDGPU::AReg_1024_Align2RegClassID; |
15662 | default: |
15663 | return -1; |
15664 | } |
15665 | } |
15666 | |
15667 | // Figure out which registers should be reserved for stack access. Only after |
15668 | // the function is legalized do we know all of the non-spill stack objects or if |
15669 | // calls are present. |
15670 | void SITargetLowering::finalizeLowering(MachineFunction &MF) const { |
15671 | MachineRegisterInfo &MRI = MF.getRegInfo(); |
15672 | SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
15673 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
15674 | const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); |
15675 | const SIInstrInfo *TII = ST.getInstrInfo(); |
15676 | |
15677 | if (Info->isEntryFunction()) { |
15678 | // Callable functions have fixed registers used for stack access. |
15679 | reservePrivateMemoryRegs(TM: getTargetMachine(), MF, TRI: *TRI, Info&: *Info); |
15680 | } |
15681 | |
15682 | // TODO: Move this logic to getReservedRegs() |
15683 | // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling. |
15684 | unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF); |
15685 | Register SReg = ST.isWave32() |
15686 | ? AMDGPU::SGPR_32RegClass.getRegister(i: MaxNumSGPRs - 1) |
15687 | : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2, |
15688 | RC: &AMDGPU::SGPR_64RegClass); |
15689 | Info->setSGPRForEXECCopy(SReg); |
15690 | |
15691 | assert(!TRI->isSubRegister(Info->getScratchRSrcReg(), |
15692 | Info->getStackPtrOffsetReg())); |
15693 | if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG) |
15694 | MRI.replaceRegWith(FromReg: AMDGPU::SP_REG, ToReg: Info->getStackPtrOffsetReg()); |
15695 | |
15696 | // We need to worry about replacing the default register with itself in case |
15697 | // of MIR testcases missing the MFI. |
15698 | if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG) |
15699 | MRI.replaceRegWith(FromReg: AMDGPU::PRIVATE_RSRC_REG, ToReg: Info->getScratchRSrcReg()); |
15700 | |
15701 | if (Info->getFrameOffsetReg() != AMDGPU::FP_REG) |
15702 | MRI.replaceRegWith(FromReg: AMDGPU::FP_REG, ToReg: Info->getFrameOffsetReg()); |
15703 | |
15704 | Info->limitOccupancy(MF); |
15705 | |
15706 | if (ST.isWave32() && !MF.empty()) { |
15707 | for (auto &MBB : MF) { |
15708 | for (auto &MI : MBB) { |
15709 | TII->fixImplicitOperands(MI); |
15710 | } |
15711 | } |
15712 | } |
15713 | |
15714 | // FIXME: This is a hack to fixup AGPR classes to use the properly aligned |
15715 | // classes if required. Ideally the register class constraints would differ |
15716 | // per-subtarget, but there's no easy way to achieve that right now. This is |
15717 | // not a problem for VGPRs because the correctly aligned VGPR class is implied |
15718 | // from using them as the register class for legal types. |
15719 | if (ST.needsAlignedVGPRs()) { |
15720 | for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { |
15721 | const Register Reg = Register::index2VirtReg(Index: I); |
15722 | const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg); |
15723 | if (!RC) |
15724 | continue; |
15725 | int NewClassID = getAlignedAGPRClassID(UnalignedClassID: RC->getID()); |
15726 | if (NewClassID != -1) |
15727 | MRI.setRegClass(Reg, RC: TRI->getRegClass(RCID: NewClassID)); |
15728 | } |
15729 | } |
15730 | |
15731 | TargetLoweringBase::finalizeLowering(MF); |
15732 | } |
15733 | |
15734 | void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op, |
15735 | KnownBits &Known, |
15736 | const APInt &DemandedElts, |
15737 | const SelectionDAG &DAG, |
15738 | unsigned Depth) const { |
15739 | Known.resetAll(); |
15740 | unsigned Opc = Op.getOpcode(); |
15741 | switch (Opc) { |
15742 | case ISD::INTRINSIC_WO_CHAIN: { |
15743 | unsigned IID = Op.getConstantOperandVal(i: 0); |
15744 | switch (IID) { |
15745 | case Intrinsic::amdgcn_mbcnt_lo: |
15746 | case Intrinsic::amdgcn_mbcnt_hi: { |
15747 | const GCNSubtarget &ST = |
15748 | DAG.getMachineFunction().getSubtarget<GCNSubtarget>(); |
// These return at most (wavefront size - 1) + src1.
// As long as src1 is an immediate we can calculate the known bits.
15751 | KnownBits Src1Known = DAG.computeKnownBits(Op: Op.getOperand(i: 2), Depth: Depth + 1); |
15752 | unsigned Src1ValBits = Src1Known.countMaxActiveBits(); |
15753 | unsigned MaxActiveBits = std::max(a: Src1ValBits, b: ST.getWavefrontSizeLog2()); |
15754 | // Cater for potential carry |
15755 | MaxActiveBits += Src1ValBits ? 1 : 0; |
15756 | unsigned Size = Op.getValueType().getSizeInBits(); |
15757 | if (MaxActiveBits < Size) |
15758 | Known.Zero.setHighBits(Size - MaxActiveBits); |
15759 | return; |
15760 | } |
15761 | } |
15762 | break; |
15763 | } |
15764 | } |
15765 | return AMDGPUTargetLowering::computeKnownBitsForTargetNode( |
15766 | Op, Known, DemandedElts, DAG, Depth); |
15767 | } |
15768 | |
15769 | void SITargetLowering::computeKnownBitsForFrameIndex( |
15770 | const int FI, KnownBits &Known, const MachineFunction &MF) const { |
15771 | TargetLowering::computeKnownBitsForFrameIndex(FIOp: FI, Known, MF); |
15772 | |
15773 | // Set the high bits to zero based on the maximum allowed scratch size per |
15774 | // wave. We can't use vaddr in MUBUF instructions if we don't know the address |
15775 | // calculation won't overflow, so assume the sign bit is never set. |
15776 | Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex()); |
15777 | } |
15778 | |
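// Workitem IDs are bounded by the maximum workitem count in dimension Dim, so
// the high bits of the result are known to be zero.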
15779 | static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB, |
15780 | KnownBits &Known, unsigned Dim) { |
15781 | unsigned MaxValue = |
15782 | ST.getMaxWorkitemID(Kernel: KB.getMachineFunction().getFunction(), Dimension: Dim); |
15783 | Known.Zero.setHighBits(llvm::countl_zero(Val: MaxValue)); |
15784 | } |
15785 | |
15786 | void SITargetLowering::computeKnownBitsForTargetInstr( |
15787 | GISelKnownBits &KB, Register R, KnownBits &Known, const APInt &DemandedElts, |
15788 | const MachineRegisterInfo &MRI, unsigned Depth) const { |
15789 | const MachineInstr *MI = MRI.getVRegDef(Reg: R); |
15790 | switch (MI->getOpcode()) { |
15791 | case AMDGPU::G_INTRINSIC: |
15792 | case AMDGPU::G_INTRINSIC_CONVERGENT: { |
15793 | switch (cast<GIntrinsic>(Val: MI)->getIntrinsicID()) { |
15794 | case Intrinsic::amdgcn_workitem_id_x: |
15795 | knownBitsForWorkitemID(ST: *getSubtarget(), KB, Known, Dim: 0); |
15796 | break; |
15797 | case Intrinsic::amdgcn_workitem_id_y: |
15798 | knownBitsForWorkitemID(ST: *getSubtarget(), KB, Known, Dim: 1); |
15799 | break; |
15800 | case Intrinsic::amdgcn_workitem_id_z: |
15801 | knownBitsForWorkitemID(ST: *getSubtarget(), KB, Known, Dim: 2); |
15802 | break; |
15803 | case Intrinsic::amdgcn_mbcnt_lo: |
15804 | case Intrinsic::amdgcn_mbcnt_hi: { |
15805 | // These return at most the wavefront size - 1. |
15806 | unsigned Size = MRI.getType(Reg: R).getSizeInBits(); |
15807 | Known.Zero.setHighBits(Size - getSubtarget()->getWavefrontSizeLog2()); |
15808 | break; |
15809 | } |
15810 | case Intrinsic::amdgcn_groupstaticsize: { |
15811 | // We can report everything over the maximum size as 0. We can't report |
15812 | // based on the actual size because we don't know if it's accurate or not |
15813 | // at any given point. |
15814 | Known.Zero.setHighBits( |
15815 | llvm::countl_zero(Val: getSubtarget()->getAddressableLocalMemorySize())); |
15816 | break; |
15817 | } |
15818 | } |
15819 | break; |
15820 | } |
15821 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE: |
15822 | Known.Zero.setHighBits(24); |
15823 | break; |
15824 | case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: |
15825 | Known.Zero.setHighBits(16); |
15826 | break; |
15827 | case AMDGPU::G_AMDGPU_SMED3: |
15828 | case AMDGPU::G_AMDGPU_UMED3: { |
15829 | auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs(); |
15830 | |
15831 | KnownBits Known2; |
15832 | KB.computeKnownBitsImpl(R: Src2, Known&: Known2, DemandedElts, Depth: Depth + 1); |
15833 | if (Known2.isUnknown()) |
15834 | break; |
15835 | |
15836 | KnownBits Known1; |
15837 | KB.computeKnownBitsImpl(R: Src1, Known&: Known1, DemandedElts, Depth: Depth + 1); |
15838 | if (Known1.isUnknown()) |
15839 | break; |
15840 | |
15841 | KnownBits Known0; |
15842 | KB.computeKnownBitsImpl(R: Src0, Known&: Known0, DemandedElts, Depth: Depth + 1); |
15843 | if (Known0.isUnknown()) |
15844 | break; |
15845 | |
15846 | // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling. |
15847 | Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero; |
15848 | Known.One = Known0.One & Known1.One & Known2.One; |
15849 | break; |
15850 | } |
15851 | } |
15852 | } |
15853 | |
15854 | Align SITargetLowering::computeKnownAlignForTargetInstr( |
15855 | GISelKnownBits &KB, Register R, const MachineRegisterInfo &MRI, |
15856 | unsigned Depth) const { |
15857 | const MachineInstr *MI = MRI.getVRegDef(Reg: R); |
15858 | if (auto *GI = dyn_cast<GIntrinsic>(Val: MI)) { |
15859 | // FIXME: Can this move to generic code? What about the case where the call |
15860 | // site specifies a lower alignment? |
15861 | Intrinsic::ID IID = GI->getIntrinsicID(); |
15862 | LLVMContext &Ctx = KB.getMachineFunction().getFunction().getContext(); |
15863 | AttributeList Attrs = Intrinsic::getAttributes(C&: Ctx, id: IID); |
15864 | if (MaybeAlign RetAlign = Attrs.getRetAlignment()) |
15865 | return *RetAlign; |
15866 | } |
15867 | return Align(1); |
15868 | } |
15869 | |
15870 | Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { |
15871 | const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML); |
15872 | const Align CacheLineAlign = Align(64); |
15873 | |
// Pre-GFX10 targets did not benefit from loop alignment.
15875 | if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() || |
15876 | getSubtarget()->hasInstFwdPrefetchBug()) |
15877 | return PrefAlign; |
15878 | |
// On GFX10 the I$ consists of 4 x 64 byte cache lines.
// By default the prefetcher keeps one cache line behind and reads two ahead.
// We can modify it with S_INST_PREFETCH for larger loops to have two lines
// behind and one ahead.
// Therefore we can benefit from aligning loop headers if the loop fits in
// 192 bytes.
// If the loop fits in 64 bytes it always spans no more than two cache lines
// and does not need an alignment.
// Otherwise, if the loop is at most 128 bytes, we do not need to modify the
// prefetch settings; if it is at most 192 bytes, we need two lines behind.
15888 | |
15889 | const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); |
const MachineBasicBlock *Header = ML->getHeader();
15891 | if (Header->getAlignment() != PrefAlign) |
15892 | return Header->getAlignment(); // Already processed. |
15893 | |
15894 | unsigned LoopSize = 0; |
15895 | for (const MachineBasicBlock *MBB : ML->blocks()) { |
// If an inner loop block is aligned, assume on average half of the alignment
// size is added as nops.
15898 | if (MBB != Header) |
15899 | LoopSize += MBB->getAlignment().value() / 2; |
15900 | |
15901 | for (const MachineInstr &MI : *MBB) { |
15902 | LoopSize += TII->getInstSizeInBytes(MI); |
15903 | if (LoopSize > 192) |
15904 | return PrefAlign; |
15905 | } |
15906 | } |
15907 | |
15908 | if (LoopSize <= 64) |
15909 | return PrefAlign; |
15910 | |
15911 | if (LoopSize <= 128) |
15912 | return CacheLineAlign; |
15913 | |
// If any of the parent loops is surrounded by prefetch instructions, do not
// insert new ones for the inner loop, as that would reset the parent's
// settings.
15916 | for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) { |
15917 | if (MachineBasicBlock *Exit = P->getExitBlock()) { |
15918 | auto I = Exit->getFirstNonDebugInstr(); |
15919 | if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH) |
15920 | return CacheLineAlign; |
15921 | } |
15922 | } |
15923 | |
15924 | MachineBasicBlock *Pre = ML->getLoopPreheader(); |
15925 | MachineBasicBlock *Exit = ML->getExitBlock(); |
15926 | |
15927 | if (Pre && Exit) { |
15928 | auto PreTerm = Pre->getFirstTerminator(); |
15929 | if (PreTerm == Pre->begin() || |
15930 | std::prev(x: PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH) |
15931 | BuildMI(BB&: *Pre, I: PreTerm, MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_INST_PREFETCH)) |
15932 | .addImm(Val: 1); // prefetch 2 lines behind PC |
15933 | |
15934 | auto ExitHead = Exit->getFirstNonDebugInstr(); |
15935 | if (ExitHead == Exit->end() || |
15936 | ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH) |
15937 | BuildMI(BB&: *Exit, I: ExitHead, MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_INST_PREFETCH)) |
15938 | .addImm(Val: 2); // prefetch 1 line behind PC |
15939 | } |
15940 | |
15941 | return CacheLineAlign; |
15942 | } |
15943 | |
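// Assertion helper: follow the chain of CopyFromReg nodes to determine whether
// the copied value ultimately comes from an INLINEASM / INLINEASM_BR node.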
15944 | LLVM_ATTRIBUTE_UNUSED |
15945 | static bool isCopyFromRegOfInlineAsm(const SDNode *N) { |
15946 | assert(N->getOpcode() == ISD::CopyFromReg); |
15947 | do { |
15948 | // Follow the chain until we find an INLINEASM node. |
15949 | N = N->getOperand(Num: 0).getNode(); |
15950 | if (N->getOpcode() == ISD::INLINEASM || |
15951 | N->getOpcode() == ISD::INLINEASM_BR) |
15952 | return true; |
15953 | } while (N->getOpcode() == ISD::CopyFromReg); |
15954 | return false; |
15955 | } |
15956 | |
15957 | bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N, |
15958 | FunctionLoweringInfo *FLI, |
15959 | UniformityInfo *UA) const { |
15960 | switch (N->getOpcode()) { |
15961 | case ISD::CopyFromReg: { |
15962 | const RegisterSDNode *R = cast<RegisterSDNode>(Val: N->getOperand(Num: 1)); |
15963 | const MachineRegisterInfo &MRI = FLI->MF->getRegInfo(); |
15964 | const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); |
15965 | Register Reg = R->getReg(); |
15966 | |
15967 | // FIXME: Why does this need to consider isLiveIn? |
15968 | if (Reg.isPhysical() || MRI.isLiveIn(Reg)) |
15969 | return !TRI->isSGPRReg(MRI, Reg); |
15970 | |
15971 | if (const Value *V = FLI->getValueFromVirtualReg(Vreg: R->getReg())) |
15972 | return UA->isDivergent(V); |
15973 | |
15974 | assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N)); |
15975 | return !TRI->isSGPRReg(MRI, Reg); |
15976 | } |
15977 | case ISD::LOAD: { |
15978 | const LoadSDNode *L = cast<LoadSDNode>(Val: N); |
15979 | unsigned AS = L->getAddressSpace(); |
15980 | // A flat load may access private memory. |
15981 | return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS; |
15982 | } |
15983 | case ISD::CALLSEQ_END: |
15984 | return true; |
15985 | case ISD::INTRINSIC_WO_CHAIN: |
15986 | return AMDGPU::isIntrinsicSourceOfDivergence(IntrID: N->getConstantOperandVal(Num: 0)); |
15987 | case ISD::INTRINSIC_W_CHAIN: |
15988 | return AMDGPU::isIntrinsicSourceOfDivergence(IntrID: N->getConstantOperandVal(Num: 1)); |
15989 | case AMDGPUISD::ATOMIC_CMP_SWAP: |
15990 | case AMDGPUISD::BUFFER_ATOMIC_SWAP: |
15991 | case AMDGPUISD::BUFFER_ATOMIC_ADD: |
15992 | case AMDGPUISD::BUFFER_ATOMIC_SUB: |
15993 | case AMDGPUISD::BUFFER_ATOMIC_SMIN: |
15994 | case AMDGPUISD::BUFFER_ATOMIC_UMIN: |
15995 | case AMDGPUISD::BUFFER_ATOMIC_SMAX: |
15996 | case AMDGPUISD::BUFFER_ATOMIC_UMAX: |
15997 | case AMDGPUISD::BUFFER_ATOMIC_AND: |
15998 | case AMDGPUISD::BUFFER_ATOMIC_OR: |
15999 | case AMDGPUISD::BUFFER_ATOMIC_XOR: |
16000 | case AMDGPUISD::BUFFER_ATOMIC_INC: |
16001 | case AMDGPUISD::BUFFER_ATOMIC_DEC: |
16002 | case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP: |
16003 | case AMDGPUISD::BUFFER_ATOMIC_CSUB: |
16004 | case AMDGPUISD::BUFFER_ATOMIC_FADD: |
16005 | case AMDGPUISD::BUFFER_ATOMIC_FMIN: |
16006 | case AMDGPUISD::BUFFER_ATOMIC_FMAX: |
16007 | // Target-specific read-modify-write atomics are sources of divergence. |
16008 | return true; |
16009 | default: |
16010 | if (auto *A = dyn_cast<AtomicSDNode>(Val: N)) { |
16011 | // Generic read-modify-write atomics are sources of divergence. |
16012 | return A->readMem() && A->writeMem(); |
16013 | } |
16014 | return false; |
16015 | } |
16016 | } |
16017 | |
16018 | bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG, |
16019 | EVT VT) const { |
16020 | switch (VT.getScalarType().getSimpleVT().SimpleTy) { |
16021 | case MVT::f32: |
16022 | return !denormalModeIsFlushAllF32(MF: DAG.getMachineFunction()); |
16023 | case MVT::f64: |
16024 | case MVT::f16: |
16025 | return !denormalModeIsFlushAllF64F16(MF: DAG.getMachineFunction()); |
16026 | default: |
16027 | return false; |
16028 | } |
16029 | } |
16030 | |
16031 | bool SITargetLowering::denormalsEnabledForType( |
16032 | LLT Ty, const MachineFunction &MF) const { |
16033 | switch (Ty.getScalarSizeInBits()) { |
16034 | case 32: |
16035 | return !denormalModeIsFlushAllF32(MF); |
16036 | case 64: |
16037 | case 16: |
16038 | return !denormalModeIsFlushAllF64F16(MF); |
16039 | default: |
16040 | return false; |
16041 | } |
16042 | } |
16043 | |
16044 | bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op, |
16045 | const SelectionDAG &DAG, |
16046 | bool SNaN, |
16047 | unsigned Depth) const { |
16048 | if (Op.getOpcode() == AMDGPUISD::CLAMP) { |
16049 | const MachineFunction &MF = DAG.getMachineFunction(); |
16050 | const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); |
16051 | |
16052 | if (Info->getMode().DX10Clamp) |
16053 | return true; // Clamped to 0. |
16054 | return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 0), SNaN, Depth: Depth + 1); |
16055 | } |
16056 | |
16057 | return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG, |
16058 | SNaN, Depth); |
16059 | } |
16060 | |
16061 | #if 0 |
16062 | // FIXME: This should be checked before unsafe fp atomics are enabled |
16063 | // Global FP atomic instructions have a hardcoded FP mode and do not support |
16064 | // FP32 denormals, and only support v2f16 denormals. |
16065 | static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) { |
16066 | const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics(); |
16067 | auto DenormMode = RMW->getParent()->getParent()->getDenormalMode(Flt); |
16068 | if (&Flt == &APFloat::IEEEsingle()) |
16069 | return DenormMode == DenormalMode::getPreserveSign(); |
16070 | return DenormMode == DenormalMode::getIEEE(); |
16071 | } |
16072 | #endif |
16073 | |
16074 | // The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe |
16075 | // floating point atomic instructions. May generate more efficient code, |
16076 | // but may not respect rounding and denormal modes, and may give incorrect |
16077 | // results for certain memory destinations. |
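// For example (illustrative IR only; the function and attribute group names
// are placeholders), a function opting in to unsafe FP atomics carries:
//   define float @foo(ptr %p, float %v) #0 { ... }
//   attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" }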
static bool unsafeFPAtomicsDisabled(Function *F) {
16079 | return F->getFnAttribute(Kind: "amdgpu-unsafe-fp-atomics" ).getValueAsString() != |
16080 | "true" ; |
16081 | } |
16082 | |
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
16084 | LLVMContext &Ctx = RMW->getContext(); |
16085 | SmallVector<StringRef> SSNs; |
16086 | Ctx.getSyncScopeNames(SSNs); |
16087 | StringRef MemScope = SSNs[RMW->getSyncScopeID()].empty() |
16088 | ? "system" |
16089 | : SSNs[RMW->getSyncScopeID()]; |
16090 | |
16091 | return OptimizationRemark(DEBUG_TYPE, "Passed" , RMW) |
16092 | << "Hardware instruction generated for atomic " |
16093 | << RMW->getOperationName(Op: RMW->getOperation()) |
16094 | << " operation at memory scope " << MemScope; |
16095 | } |
16096 | |
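// Return true if Ty is a 2-element vector of half or bfloat.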
16097 | static bool isHalf2OrBFloat2(Type *Ty) { |
16098 | if (auto *VT = dyn_cast<FixedVectorType>(Val: Ty)) { |
16099 | Type *EltTy = VT->getElementType(); |
16100 | return VT->getNumElements() == 2 && |
16101 | (EltTy->isHalfTy() || EltTy->isBFloatTy()); |
16102 | } |
16103 | |
16104 | return false; |
16105 | } |
16106 | |
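// Return true if Ty is <2 x half>; isBFloat2 below is the <2 x bfloat>
// analogue.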
16107 | static bool isHalf2(Type *Ty) { |
16108 | FixedVectorType *VT = dyn_cast<FixedVectorType>(Val: Ty); |
16109 | return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy(); |
16110 | } |
16111 | |
16112 | static bool isBFloat2(Type *Ty) { |
16113 | FixedVectorType *VT = dyn_cast<FixedVectorType>(Val: Ty); |
16114 | return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy(); |
16115 | } |
16116 | |
16117 | TargetLowering::AtomicExpansionKind |
16118 | SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { |
16119 | unsigned AS = RMW->getPointerAddressSpace(); |
16120 | if (AS == AMDGPUAS::PRIVATE_ADDRESS) |
16121 | return AtomicExpansionKind::NotAtomic; |
16122 | |
16123 | auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) { |
16124 | OptimizationRemarkEmitter ORE(RMW->getFunction()); |
16125 | ORE.emit(RemarkBuilder: [=]() { |
16126 | return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request." ; |
16127 | }); |
16128 | return Kind; |
16129 | }; |
16130 | |
16131 | auto SSID = RMW->getSyncScopeID(); |
16132 | bool HasSystemScope = |
16133 | SSID == SyncScope::System || |
16134 | SSID == RMW->getContext().getOrInsertSyncScopeID(SSN: "one-as" ); |
16135 | |
16136 | switch (RMW->getOperation()) { |
16137 | case AtomicRMWInst::Sub: |
16138 | case AtomicRMWInst::Or: |
16139 | case AtomicRMWInst::Xor: { |
// Atomic sub/or/xor do not work over PCI express, but atomic add does.
// InstCombine canonicalizes these operations with a zero operand to 'or',
// so undo that here.
16142 | if (HasSystemScope && AMDGPU::isFlatGlobalAddrSpace(AS)) { |
16143 | if (Constant *ConstVal = dyn_cast<Constant>(Val: RMW->getValOperand()); |
16144 | ConstVal && ConstVal->isNullValue()) |
16145 | return AtomicExpansionKind::Expand; |
16146 | } |
16147 | |
16148 | break; |
16149 | } |
16150 | case AtomicRMWInst::FAdd: { |
16151 | Type *Ty = RMW->getType(); |
16152 | |
16153 | // TODO: Handle REGION_ADDRESS |
16154 | if (AS == AMDGPUAS::LOCAL_ADDRESS) { |
16155 | // DS F32 FP atomics do respect the denormal mode, but the rounding mode |
16156 | // is fixed to round-to-nearest-even. |
16157 | // |
16158 | // F64 / PK_F16 / PK_BF16 never flush and are also fixed to |
16159 | // round-to-nearest-even. |
16160 | // |
16161 | // We ignore the rounding mode problem, even in strictfp. The C++ standard |
16162 | // suggests it is OK if the floating-point mode may not match the calling |
16163 | // thread. |
16164 | if (Ty->isFloatTy()) { |
16165 | return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None |
16166 | : AtomicExpansionKind::CmpXChg; |
16167 | } |
16168 | |
16169 | if (Ty->isDoubleTy()) { |
16170 | // Ignores denormal mode, but we don't consider flushing mandatory. |
16171 | return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None |
16172 | : AtomicExpansionKind::CmpXChg; |
16173 | } |
16174 | |
16175 | if (Subtarget->hasAtomicDsPkAdd16Insts() && isHalf2OrBFloat2(Ty)) |
16176 | return AtomicExpansionKind::None; |
16177 | |
16178 | return AtomicExpansionKind::CmpXChg; |
16179 | } |
16180 | |
16181 | if (!AMDGPU::isFlatGlobalAddrSpace(AS) && |
16182 | AS != AMDGPUAS::BUFFER_FAT_POINTER) |
16183 | return AtomicExpansionKind::CmpXChg; |
16184 | |
16185 | if (Subtarget->hasGFX940Insts() && (Ty->isFloatTy() || Ty->isDoubleTy())) |
16186 | return AtomicExpansionKind::None; |
16187 | |
16188 | if (AS == AMDGPUAS::FLAT_ADDRESS) { |
16189 | // gfx940, gfx12 |
16190 | // FIXME: Needs to account for no fine-grained memory |
16191 | if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty)) |
16192 | return AtomicExpansionKind::None; |
16193 | } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) { |
16194 | // gfx90a, gfx940, gfx12 |
16195 | // FIXME: Needs to account for no fine-grained memory |
16196 | if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty)) |
16197 | return AtomicExpansionKind::None; |
16198 | |
16199 | // gfx940, gfx12 |
16200 | // FIXME: Needs to account for no fine-grained memory |
16201 | if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty)) |
16202 | return AtomicExpansionKind::None; |
16203 | } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) { |
16204 | // gfx90a, gfx940, gfx12 |
16205 | // FIXME: Needs to account for no fine-grained memory |
16206 | if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty)) |
16207 | return AtomicExpansionKind::None; |
16208 | |
// While gfx90a/gfx940 support v2bf16 for global/flat, they do not for
// buffer. gfx12 does have the buffer version.
16211 | if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isBFloat2(Ty)) |
16212 | return AtomicExpansionKind::None; |
16213 | } |
16214 | |
16215 | if (unsafeFPAtomicsDisabled(F: RMW->getFunction())) |
16216 | return AtomicExpansionKind::CmpXChg; |
16217 | |
16218 | // Always expand system scope fp atomics. |
16219 | if (HasSystemScope) |
16220 | return AtomicExpansionKind::CmpXChg; |
16221 | |
16222 | // global and flat atomic fadd f64: gfx90a, gfx940. |
16223 | if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy()) |
16224 | return ReportUnsafeHWInst(AtomicExpansionKind::None); |
16225 | |
16226 | if (AS != AMDGPUAS::FLAT_ADDRESS) { |
16227 | if (Ty->isFloatTy()) { |
16228 | // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+. |
16229 | if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts()) |
16230 | return ReportUnsafeHWInst(AtomicExpansionKind::None); |
16231 | // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+. |
16232 | if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts()) |
16233 | return ReportUnsafeHWInst(AtomicExpansionKind::None); |
16234 | } else { |
16235 | // gfx908 |
16236 | if (RMW->use_empty() && |
16237 | Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() && isHalf2(Ty)) |
16238 | return ReportUnsafeHWInst(AtomicExpansionKind::None); |
16239 | } |
16240 | } |
16241 | |
16242 | // flat atomic fadd f32: gfx940, gfx11+. |
16243 | if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) { |
16244 | if (Subtarget->hasFlatAtomicFaddF32Inst()) |
16245 | return ReportUnsafeHWInst(AtomicExpansionKind::None); |
16246 | |
// If it is in the flat address space and the type is float, we will try to
// expand it if the target supports both global and LDS atomic fadd. The
// expansion emits a runtime check of the address space: if the address is
// in the global address space we emit the global atomic fadd, and if it is
// in the shared address space we emit the LDS atomic fadd.
16252 | if (Subtarget->hasLDSFPAtomicAddF32()) { |
16253 | if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts()) |
16254 | return AtomicExpansionKind::Expand; |
16255 | if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts()) |
16256 | return AtomicExpansionKind::Expand; |
16257 | } |
16258 | } |
16259 | |
16260 | return AtomicExpansionKind::CmpXChg; |
16261 | } |
16262 | case AtomicRMWInst::FMin: |
16263 | case AtomicRMWInst::FMax: { |
16264 | Type *Ty = RMW->getType(); |
16265 | |
16266 | // LDS float and double fmin/fmax were always supported. |
16267 | if (AS == AMDGPUAS::LOCAL_ADDRESS && (Ty->isFloatTy() || Ty->isDoubleTy())) |
16268 | return AtomicExpansionKind::None; |
16269 | |
16270 | if (unsafeFPAtomicsDisabled(F: RMW->getFunction())) |
16271 | return AtomicExpansionKind::CmpXChg; |
16272 | |
16273 | // Always expand system scope fp atomics. |
16274 | if (HasSystemScope) |
16275 | return AtomicExpansionKind::CmpXChg; |
16276 | |
16277 | // For flat and global cases: |
16278 | // float, double in gfx7. Manual claims denormal support. |
16279 | // Removed in gfx8. |
16280 | // float, double restored in gfx10. |
16281 | // double removed again in gfx11, so only f32 for gfx11/gfx12. |
16282 | // |
16283 | // For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but no |
16284 | // f32. |
16285 | // |
16286 | // FIXME: Check scope and fine grained memory |
16287 | if (AS == AMDGPUAS::FLAT_ADDRESS) { |
16288 | if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy()) |
16289 | return ReportUnsafeHWInst(AtomicExpansionKind::None); |
16290 | if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy()) |
16291 | return ReportUnsafeHWInst(AtomicExpansionKind::None); |
16292 | } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) || |
16293 | AS == AMDGPUAS::BUFFER_FAT_POINTER) { |
16294 | if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy()) |
16295 | return ReportUnsafeHWInst(AtomicExpansionKind::None); |
16296 | if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy()) |
16297 | return ReportUnsafeHWInst(AtomicExpansionKind::None); |
16298 | } |
16299 | |
16300 | return AtomicExpansionKind::CmpXChg; |
16301 | } |
16302 | case AtomicRMWInst::Min: |
16303 | case AtomicRMWInst::Max: |
16304 | case AtomicRMWInst::UMin: |
16305 | case AtomicRMWInst::UMax: { |
16306 | if (AMDGPU::isFlatGlobalAddrSpace(AS) || |
16307 | AS == AMDGPUAS::BUFFER_FAT_POINTER) { |
16308 | // Always expand system scope min/max atomics. |
16309 | if (HasSystemScope) |
16310 | return AtomicExpansionKind::CmpXChg; |
16311 | } |
16312 | break; |
16313 | } |
16314 | default: |
16315 | break; |
16316 | } |
16317 | |
16318 | return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW); |
16319 | } |
16320 | |
16321 | TargetLowering::AtomicExpansionKind |
16322 | SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { |
16323 | return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS |
16324 | ? AtomicExpansionKind::NotAtomic |
16325 | : AtomicExpansionKind::None; |
16326 | } |
16327 | |
16328 | TargetLowering::AtomicExpansionKind |
16329 | SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { |
16330 | return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS |
16331 | ? AtomicExpansionKind::NotAtomic |
16332 | : AtomicExpansionKind::None; |
16333 | } |
16334 | |
16335 | TargetLowering::AtomicExpansionKind |
16336 | SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const { |
16337 | return CmpX->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS |
16338 | ? AtomicExpansionKind::NotAtomic |
16339 | : AtomicExpansionKind::None; |
16340 | } |
16341 | |
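// Pick the register class for VT based on divergence: uniform values get an
// SGPR class (including wave-mask i1 values selected to VReg_1), divergent
// values get a VGPR class.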
16342 | const TargetRegisterClass * |
16343 | SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const { |
16344 | const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, isDivergent: false); |
16345 | const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); |
16346 | if (RC == &AMDGPU::VReg_1RegClass && !isDivergent) |
16347 | return Subtarget->getWavefrontSize() == 64 ? &AMDGPU::SReg_64RegClass |
16348 | : &AMDGPU::SReg_32RegClass; |
16349 | if (!TRI->isSGPRClass(RC) && !isDivergent) |
16350 | return TRI->getEquivalentSGPRClass(VRC: RC); |
16351 | if (TRI->isSGPRClass(RC) && isDivergent) |
16352 | return TRI->getEquivalentVGPRClass(SRC: RC); |
16353 | |
16354 | return RC; |
16355 | } |
16356 | |
16357 | // FIXME: This is a workaround for DivergenceAnalysis not understanding always |
16358 | // uniform values (as produced by the mask results of control flow intrinsics) |
16359 | // used outside of divergent blocks. The phi users need to also be treated as |
16360 | // always uniform. |
16361 | // |
16362 | // FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis? |
16363 | static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited, |
16364 | unsigned WaveSize) { |
16365 | // FIXME: We assume we never cast the mask results of a control flow |
16366 | // intrinsic. |
16367 | // Early exit if the type won't be consistent as a compile time hack. |
16368 | IntegerType *IT = dyn_cast<IntegerType>(Val: V->getType()); |
16369 | if (!IT || IT->getBitWidth() != WaveSize) |
16370 | return false; |
16371 | |
16372 | if (!isa<Instruction>(Val: V)) |
16373 | return false; |
16374 | if (!Visited.insert(Ptr: V).second) |
16375 | return false; |
16376 | bool Result = false; |
16377 | for (const auto *U : V->users()) { |
16378 | if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: U)) { |
16379 | if (V == U->getOperand(i: 1)) { |
16380 | switch (Intrinsic->getIntrinsicID()) { |
16381 | default: |
16382 | Result = false; |
16383 | break; |
16384 | case Intrinsic::amdgcn_if_break: |
16385 | case Intrinsic::amdgcn_if: |
16386 | case Intrinsic::amdgcn_else: |
16387 | Result = true; |
16388 | break; |
16389 | } |
16390 | } |
16391 | if (V == U->getOperand(i: 0)) { |
16392 | switch (Intrinsic->getIntrinsicID()) { |
16393 | default: |
16394 | Result = false; |
16395 | break; |
16396 | case Intrinsic::amdgcn_end_cf: |
16397 | case Intrinsic::amdgcn_loop: |
16398 | Result = true; |
16399 | break; |
16400 | } |
16401 | } |
16402 | } else { |
16403 | Result = hasCFUser(V: U, Visited, WaveSize); |
16404 | } |
16405 | if (Result) |
16406 | break; |
16407 | } |
16408 | return Result; |
16409 | } |
16410 | |
16411 | bool SITargetLowering::requiresUniformRegister(MachineFunction &MF, |
16412 | const Value *V) const { |
16413 | if (const CallInst *CI = dyn_cast<CallInst>(Val: V)) { |
16414 | if (CI->isInlineAsm()) { |
16415 | // FIXME: This cannot give a correct answer. This should only trigger in |
16416 | // the case where inline asm returns mixed SGPR and VGPR results, used |
16417 | // outside the defining block. We don't have a specific result to |
16418 | // consider, so this assumes if any value is SGPR, the overall register |
16419 | // also needs to be SGPR. |
16420 | const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo(); |
16421 | TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints( |
16422 | DL: MF.getDataLayout(), TRI: Subtarget->getRegisterInfo(), Call: *CI); |
16423 | for (auto &TC : TargetConstraints) { |
16424 | if (TC.Type == InlineAsm::isOutput) { |
16425 | ComputeConstraintToUse(OpInfo&: TC, Op: SDValue()); |
16426 | const TargetRegisterClass *RC = getRegForInlineAsmConstraint( |
16427 | TRI_: SIRI, Constraint: TC.ConstraintCode, VT: TC.ConstraintVT).second; |
16428 | if (RC && SIRI->isSGPRClass(RC)) |
16429 | return true; |
16430 | } |
16431 | } |
16432 | } |
16433 | } |
16434 | SmallPtrSet<const Value *, 16> Visited; |
16435 | return hasCFUser(V, Visited, WaveSize: Subtarget->getWavefrontSize()); |
16436 | } |
16437 | |
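// Return true if some user of N is a memory node that uses N as its base
// pointer operand.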
16438 | bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const { |
16439 | SDNode::use_iterator I = N->use_begin(), E = N->use_end(); |
16440 | for (; I != E; ++I) { |
16441 | if (MemSDNode *M = dyn_cast<MemSDNode>(Val: *I)) { |
16442 | if (getBasePtrIndex(N: M) == I.getOperandNo()) |
16443 | return true; |
16444 | } |
16445 | } |
16446 | return false; |
16447 | } |
16448 | |
16449 | bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0, |
16450 | SDValue N1) const { |
16451 | if (!N0.hasOneUse()) |
16452 | return false; |
// Take the opportunity to keep N0 uniform where possible.
16454 | if (N0->isDivergent() || !N1->isDivergent()) |
16455 | return true; |
16456 | // Check if we have a good chance to form the memory access pattern with the |
16457 | // base and offset |
16458 | return (DAG.isBaseWithConstantOffset(Op: N0) && |
16459 | hasMemSDNodeUser(N: *N0->use_begin())); |
16460 | } |
16461 | |
16462 | bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI, |
16463 | Register N0, Register N1) const { |
16464 | return MRI.hasOneNonDBGUse(RegNo: N0); // FIXME: handle regbanks |
16465 | } |
16466 | |
16467 | MachineMemOperand::Flags |
16468 | SITargetLowering::getTargetMMOFlags(const Instruction &I) const { |
16469 | // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load. |
16470 | MachineMemOperand::Flags Flags = MachineMemOperand::MONone; |
16471 | if (I.getMetadata(Kind: "amdgpu.noclobber" )) |
16472 | Flags |= MONoClobber; |
16473 | if (I.getMetadata(Kind: "amdgpu.last.use" )) |
16474 | Flags |= MOLastUse; |
16475 | return Flags; |
16476 | } |
16477 | |
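// Report a physical register dependency on SCC when a compare machine node
// with an implicit SCC def feeds an i1 CopyToReg, so the scheduler can account
// for the cost of copying SCC.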
16478 | bool SITargetLowering::checkForPhysRegDependency( |
16479 | SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, |
16480 | const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const { |
16481 | if (User->getOpcode() != ISD::CopyToReg) |
16482 | return false; |
16483 | if (!Def->isMachineOpcode()) |
16484 | return false; |
16485 | MachineSDNode *MDef = dyn_cast<MachineSDNode>(Val: Def); |
16486 | if (!MDef) |
16487 | return false; |
16488 | |
16489 | unsigned ResNo = User->getOperand(Num: Op).getResNo(); |
16490 | if (User->getOperand(Num: Op)->getValueType(ResNo) != MVT::i1) |
16491 | return false; |
16492 | const MCInstrDesc &II = TII->get(Opcode: MDef->getMachineOpcode()); |
16493 | if (II.isCompare() && II.hasImplicitDefOfPhysReg(Reg: AMDGPU::SCC)) { |
16494 | PhysReg = AMDGPU::SCC; |
16495 | const TargetRegisterClass *RC = |
16496 | TRI->getMinimalPhysRegClass(Reg: PhysReg, VT: Def->getSimpleValueType(ResNo)); |
16497 | Cost = RC->getCopyCost(); |
16498 | return true; |
16499 | } |
16500 | return false; |
16501 | } |
16502 | |
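// Custom expansion for atomicrmw operations we reported as Expand: rewrite
// sub/or/xor with a zero operand back to add, or expand a flat FP32 fadd into
// the address-space dispatch sequence described in the comment below.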
16503 | void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const { |
16504 | AtomicRMWInst::BinOp Op = AI->getOperation(); |
16505 | |
16506 | if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or || |
16507 | Op == AtomicRMWInst::Xor) { |
16508 | // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0 |
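    // With a zero operand, sub/or/xor leave the memory value unchanged and
    // return it, exactly like add 0, so the rewrite is behavior-preserving.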
16509 | assert(cast<Constant>(AI->getValOperand())->isNullValue() && |
16510 | "this cannot be replaced with add" ); |
16511 | AI->setOperation(AtomicRMWInst::Add); |
16512 | return; |
16513 | } |
16514 | |
16515 | assert(Subtarget->hasAtomicFaddInsts() && |
16516 | "target should have atomic fadd instructions" ); |
16517 | assert(AI->getType()->isFloatTy() && |
16518 | AI->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS && |
16519 | "generic atomicrmw expansion only supports FP32 operand in flat " |
16520 | "address space" ); |
16521 | assert(Op == AtomicRMWInst::FAdd && "only fadd is supported for now" ); |
16522 | |
16523 | // Given: atomicrmw fadd ptr %addr, float %val ordering |
16524 | // |
16525 | // With this expansion we produce the following code: |
16526 | // [...] |
16527 | // br label %atomicrmw.check.shared |
16528 | // |
16529 | // atomicrmw.check.shared: |
16530 | // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr) |
16531 | // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private |
16532 | // |
16533 | // atomicrmw.shared: |
16534 | // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3) |
16535 | // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared, |
16536 | // float %val ordering |
16537 | // br label %atomicrmw.phi |
16538 | // |
16539 | // atomicrmw.check.private: |
  //   %is.private = call i1 @llvm.amdgcn.is.private(ptr %addr)
16541 | // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global |
16542 | // |
16543 | // atomicrmw.private: |
16544 | // %cast.private = addrspacecast ptr %addr to ptr addrspace(5) |
16545 | // %loaded.private = load float, ptr addrspace(5) %cast.private |
16546 | // %val.new = fadd float %loaded.private, %val |
16547 | // store float %val.new, ptr addrspace(5) %cast.private |
16548 | // br label %atomicrmw.phi |
16549 | // |
16550 | // atomicrmw.global: |
16551 | // %cast.global = addrspacecast ptr %addr to ptr addrspace(1) |
16552 | // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global, |
16553 | // float %val ordering |
16554 | // br label %atomicrmw.phi |
16555 | // |
16556 | // atomicrmw.phi: |
16557 | // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ], |
16558 | // [ %loaded.private, %atomicrmw.private ], |
16559 | // [ %loaded.global, %atomicrmw.global ] |
16560 | // br label %atomicrmw.end |
16561 | // |
16562 | // atomicrmw.end: |
16563 | // [...] |
16564 | |
16565 | IRBuilder<> Builder(AI); |
16566 | LLVMContext &Ctx = Builder.getContext(); |
16567 | |
16568 | BasicBlock *BB = Builder.GetInsertBlock(); |
16569 | Function *F = BB->getParent(); |
16570 | BasicBlock *ExitBB = |
16571 | BB->splitBasicBlock(I: Builder.GetInsertPoint(), BBName: "atomicrmw.end" ); |
16572 | BasicBlock *CheckSharedBB = |
16573 | BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.check.shared" , Parent: F, InsertBefore: ExitBB); |
16574 | BasicBlock *SharedBB = BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.shared" , Parent: F, InsertBefore: ExitBB); |
16575 | BasicBlock *CheckPrivateBB = |
16576 | BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.check.private" , Parent: F, InsertBefore: ExitBB); |
16577 | BasicBlock *PrivateBB = |
16578 | BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.private" , Parent: F, InsertBefore: ExitBB); |
16579 | BasicBlock *GlobalBB = BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.global" , Parent: F, InsertBefore: ExitBB); |
16580 | BasicBlock *PhiBB = BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.phi" , Parent: F, InsertBefore: ExitBB); |
16581 | |
16582 | Value *Val = AI->getValOperand(); |
16583 | Type *ValTy = Val->getType(); |
16584 | Value *Addr = AI->getPointerOperand(); |
16585 | |
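  // Helper that re-creates the original atomicrmw on an address-space-cast
  // pointer, preserving the operation, alignment, ordering, sync scope, and
  // all metadata of the original instruction.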
16586 | auto CreateNewAtomicRMW = [AI](IRBuilder<> &Builder, Value *Addr, |
16587 | Value *Val) -> Value * { |
16588 | AtomicRMWInst *OldVal = |
16589 | Builder.CreateAtomicRMW(Op: AI->getOperation(), Ptr: Addr, Val, Align: AI->getAlign(), |
16590 | Ordering: AI->getOrdering(), SSID: AI->getSyncScopeID()); |
16591 | SmallVector<std::pair<unsigned, MDNode *>> MDs; |
16592 | AI->getAllMetadata(MDs); |
16593 | for (auto &P : MDs) |
16594 | OldVal->setMetadata(KindID: P.first, Node: P.second); |
16595 | return OldVal; |
16596 | }; |
16597 | |
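  // splitBasicBlock terminated BB with an unconditional branch to ExitBB;
  // erase it so BB can branch to the address-space dispatch instead.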
16598 | std::prev(x: BB->end())->eraseFromParent(); |
16599 | Builder.SetInsertPoint(BB); |
16600 | Builder.CreateBr(Dest: CheckSharedBB); |
16601 | |
16602 | Builder.SetInsertPoint(CheckSharedBB); |
16603 | CallInst *IsShared = Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_is_shared, Types: {}, |
16604 | Args: {Addr}, FMFSource: nullptr, Name: "is.shared" ); |
16605 | Builder.CreateCondBr(Cond: IsShared, True: SharedBB, False: CheckPrivateBB); |
16606 | |
16607 | Builder.SetInsertPoint(SharedBB); |
16608 | Value *CastToLocal = Builder.CreateAddrSpaceCast( |
16609 | V: Addr, DestTy: PointerType::get(C&: Ctx, AddressSpace: AMDGPUAS::LOCAL_ADDRESS)); |
16610 | Value *LoadedShared = CreateNewAtomicRMW(Builder, CastToLocal, Val); |
16611 | Builder.CreateBr(Dest: PhiBB); |
16612 | |
16613 | Builder.SetInsertPoint(CheckPrivateBB); |
16614 | CallInst *IsPrivate = Builder.CreateIntrinsic( |
16615 | ID: Intrinsic::amdgcn_is_private, Types: {}, Args: {Addr}, FMFSource: nullptr, Name: "is.private" ); |
16616 | Builder.CreateCondBr(Cond: IsPrivate, True: PrivateBB, False: GlobalBB); |
16617 | |
16618 | Builder.SetInsertPoint(PrivateBB); |
16619 | Value *CastToPrivate = Builder.CreateAddrSpaceCast( |
16620 | V: Addr, DestTy: PointerType::get(C&: Ctx, AddressSpace: AMDGPUAS::PRIVATE_ADDRESS)); |
16621 | Value *LoadedPrivate = |
16622 | Builder.CreateLoad(Ty: ValTy, Ptr: CastToPrivate, Name: "loaded.private" ); |
16623 | Value *NewVal = Builder.CreateFAdd(L: LoadedPrivate, R: Val, Name: "val.new" ); |
16624 | Builder.CreateStore(Val: NewVal, Ptr: CastToPrivate); |
16625 | Builder.CreateBr(Dest: PhiBB); |
16626 | |
16627 | Builder.SetInsertPoint(GlobalBB); |
16628 | Value *CastToGlobal = Builder.CreateAddrSpaceCast( |
16629 | V: Addr, DestTy: PointerType::get(C&: Ctx, AddressSpace: AMDGPUAS::GLOBAL_ADDRESS)); |
16630 | Value *LoadedGlobal = CreateNewAtomicRMW(Builder, CastToGlobal, Val); |
16631 | Builder.CreateBr(Dest: PhiBB); |
16632 | |
16633 | Builder.SetInsertPoint(PhiBB); |
16634 | PHINode *Loaded = Builder.CreatePHI(Ty: ValTy, NumReservedValues: 3, Name: "loaded.phi" ); |
16635 | Loaded->addIncoming(V: LoadedShared, BB: SharedBB); |
16636 | Loaded->addIncoming(V: LoadedPrivate, BB: PrivateBB); |
16637 | Loaded->addIncoming(V: LoadedGlobal, BB: GlobalBB); |
16638 | Builder.CreateBr(Dest: ExitBB); |
16639 | |
16640 | AI->replaceAllUsesWith(V: Loaded); |
16641 | AI->eraseFromParent(); |
16642 | } |
16643 | |
16644 | LoadInst * |
16645 | SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { |
16646 | IRBuilder<> Builder(AI); |
16647 | auto Order = AI->getOrdering(); |
16648 | |
  // The optimization removes the store aspect of the atomicrmw, but if the
  // atomic ordering had release semantics, the caches would still need to be
  // flushed. That flush is not necessarily provided by a fence; a release
  // fence merely happens to perform it as well. Therefore, avoid replacing an
  // atomicrmw that has release (or stronger) semantics.
16653 | if (isReleaseOrStronger(AO: Order)) |
16654 | return nullptr; |
16655 | |
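  // Illustrative example: an idempotent update such as
  //   %old = atomicrmw or ptr %p, i32 0 acquire
  // becomes
  //   %old = load atomic i32, ptr %p acquire, align 4
  // which yields the same value without performing a read-modify-write.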
16656 | LoadInst *LI = Builder.CreateAlignedLoad( |
16657 | Ty: AI->getType(), Ptr: AI->getPointerOperand(), Align: AI->getAlign()); |
16658 | LI->setAtomic(Ordering: Order, SSID: AI->getSyncScopeID()); |
16659 | LI->copyMetadata(SrcInst: *AI); |
16660 | LI->takeName(V: AI); |
16661 | AI->replaceAllUsesWith(V: LI); |
16662 | AI->eraseFromParent(); |
16663 | return LI; |
16664 | } |
16665 | |