//===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines an instruction selector for the NVPTX target.
//
//===----------------------------------------------------------------------===//

#include "NVPTXISelDAGToDAG.h"
#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "NVPTXUtilities.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetIntrinsicInfo.h"

using namespace llvm;

#define DEBUG_TYPE "nvptx-isel"
#define PASS_NAME "NVPTX DAG->DAG Pattern Instruction Selection"

static cl::opt<bool>
    EnableRsqrtOpt("nvptx-rsqrt-approx-opt", cl::init(true), cl::Hidden,
                   cl::desc("Enable reciprocal sqrt optimization"));

/// createNVPTXISelDag - This pass converts a legalized DAG into a
/// NVPTX-specific DAG, ready for instruction scheduling.
FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
                                       llvm::CodeGenOptLevel OptLevel) {
  return new NVPTXDAGToDAGISelLegacy(TM, OptLevel);
}

NVPTXDAGToDAGISelLegacy::NVPTXDAGToDAGISelLegacy(NVPTXTargetMachine &tm,
                                                 CodeGenOptLevel OptLevel)
    : SelectionDAGISelLegacy(
          ID, std::make_unique<NVPTXDAGToDAGISel>(tm, OptLevel)) {}

char NVPTXDAGToDAGISelLegacy::ID = 0;

INITIALIZE_PASS(NVPTXDAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false)

NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
                                     CodeGenOptLevel OptLevel)
    : SelectionDAGISel(tm, OptLevel), TM(tm) {
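  // Only match wide multiplies (mul.wide) when optimizing.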
  doMulWide = (OptLevel > CodeGenOptLevel::None);
}

bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
  Subtarget = &MF.getSubtarget<NVPTXSubtarget>();
  return SelectionDAGISel::runOnMachineFunction(MF);
}

int NVPTXDAGToDAGISel::getDivF32Level() const {
  return Subtarget->getTargetLowering()->getDivF32Level();
}

bool NVPTXDAGToDAGISel::usePrecSqrtF32() const {
  return Subtarget->getTargetLowering()->usePrecSqrtF32();
}

bool NVPTXDAGToDAGISel::useF32FTZ() const {
  return Subtarget->getTargetLowering()->useF32FTZ(*MF);
}

bool NVPTXDAGToDAGISel::allowFMA() const {
  const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
  return TL->allowFMA(*MF, OptLevel);
}

bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const {
  const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
  return TL->allowUnsafeFPMath(*MF);
}

bool NVPTXDAGToDAGISel::doRsqrtOpt() const { return EnableRsqrtOpt; }

/// Select - Select instructions not customized! Used for
/// expanded, promoted and normal instructions.
void NVPTXDAGToDAGISel::Select(SDNode *N) {

  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return; // Already selected.
  }
  switch (N->getOpcode()) {
  case ISD::LOAD:
  case ISD::ATOMIC_LOAD:
    if (tryLoad(N))
      return;
    break;
  case ISD::STORE:
  case ISD::ATOMIC_STORE:
    if (tryStore(N))
      return;
    break;
  case ISD::EXTRACT_VECTOR_ELT:
    if (tryEXTRACT_VECTOR_ELEMENT(N))
      return;
    break;
  case NVPTXISD::SETP_F16X2:
    SelectSETP_F16X2(N);
    return;
  case NVPTXISD::SETP_BF16X2:
    SelectSETP_BF16X2(N);
    return;
  case NVPTXISD::LoadV2:
  case NVPTXISD::LoadV4:
    if (tryLoadVector(N))
      return;
    break;
  case NVPTXISD::LDGV2:
  case NVPTXISD::LDGV4:
  case NVPTXISD::LDUV2:
  case NVPTXISD::LDUV4:
    if (tryLDGLDU(N))
      return;
    break;
  case NVPTXISD::StoreV2:
  case NVPTXISD::StoreV4:
    if (tryStoreVector(N))
      return;
    break;
  case NVPTXISD::LoadParam:
  case NVPTXISD::LoadParamV2:
  case NVPTXISD::LoadParamV4:
    if (tryLoadParam(N))
      return;
    break;
  case NVPTXISD::StoreRetval:
  case NVPTXISD::StoreRetvalV2:
  case NVPTXISD::StoreRetvalV4:
    if (tryStoreRetval(N))
      return;
    break;
  case NVPTXISD::StoreParam:
  case NVPTXISD::StoreParamV2:
  case NVPTXISD::StoreParamV4:
  case NVPTXISD::StoreParamS32:
  case NVPTXISD::StoreParamU32:
    if (tryStoreParam(N))
      return;
    break;
  case ISD::INTRINSIC_WO_CHAIN:
    if (tryIntrinsicNoChain(N))
      return;
    break;
  case ISD::INTRINSIC_W_CHAIN:
    if (tryIntrinsicChain(N))
      return;
    break;
  case NVPTXISD::Tex1DFloatS32:
  case NVPTXISD::Tex1DFloatFloat:
  case NVPTXISD::Tex1DFloatFloatLevel:
  case NVPTXISD::Tex1DFloatFloatGrad:
  case NVPTXISD::Tex1DS32S32:
  case NVPTXISD::Tex1DS32Float:
  case NVPTXISD::Tex1DS32FloatLevel:
  case NVPTXISD::Tex1DS32FloatGrad:
  case NVPTXISD::Tex1DU32S32:
  case NVPTXISD::Tex1DU32Float:
  case NVPTXISD::Tex1DU32FloatLevel:
  case NVPTXISD::Tex1DU32FloatGrad:
  case NVPTXISD::Tex1DArrayFloatS32:
  case NVPTXISD::Tex1DArrayFloatFloat:
  case NVPTXISD::Tex1DArrayFloatFloatLevel:
  case NVPTXISD::Tex1DArrayFloatFloatGrad:
  case NVPTXISD::Tex1DArrayS32S32:
  case NVPTXISD::Tex1DArrayS32Float:
  case NVPTXISD::Tex1DArrayS32FloatLevel:
  case NVPTXISD::Tex1DArrayS32FloatGrad:
  case NVPTXISD::Tex1DArrayU32S32:
  case NVPTXISD::Tex1DArrayU32Float:
  case NVPTXISD::Tex1DArrayU32FloatLevel:
  case NVPTXISD::Tex1DArrayU32FloatGrad:
  case NVPTXISD::Tex2DFloatS32:
  case NVPTXISD::Tex2DFloatFloat:
  case NVPTXISD::Tex2DFloatFloatLevel:
  case NVPTXISD::Tex2DFloatFloatGrad:
  case NVPTXISD::Tex2DS32S32:
  case NVPTXISD::Tex2DS32Float:
  case NVPTXISD::Tex2DS32FloatLevel:
  case NVPTXISD::Tex2DS32FloatGrad:
  case NVPTXISD::Tex2DU32S32:
  case NVPTXISD::Tex2DU32Float:
  case NVPTXISD::Tex2DU32FloatLevel:
  case NVPTXISD::Tex2DU32FloatGrad:
  case NVPTXISD::Tex2DArrayFloatS32:
  case NVPTXISD::Tex2DArrayFloatFloat:
  case NVPTXISD::Tex2DArrayFloatFloatLevel:
  case NVPTXISD::Tex2DArrayFloatFloatGrad:
  case NVPTXISD::Tex2DArrayS32S32:
  case NVPTXISD::Tex2DArrayS32Float:
  case NVPTXISD::Tex2DArrayS32FloatLevel:
  case NVPTXISD::Tex2DArrayS32FloatGrad:
  case NVPTXISD::Tex2DArrayU32S32:
  case NVPTXISD::Tex2DArrayU32Float:
  case NVPTXISD::Tex2DArrayU32FloatLevel:
  case NVPTXISD::Tex2DArrayU32FloatGrad:
  case NVPTXISD::Tex3DFloatS32:
  case NVPTXISD::Tex3DFloatFloat:
  case NVPTXISD::Tex3DFloatFloatLevel:
  case NVPTXISD::Tex3DFloatFloatGrad:
  case NVPTXISD::Tex3DS32S32:
  case NVPTXISD::Tex3DS32Float:
  case NVPTXISD::Tex3DS32FloatLevel:
  case NVPTXISD::Tex3DS32FloatGrad:
  case NVPTXISD::Tex3DU32S32:
  case NVPTXISD::Tex3DU32Float:
  case NVPTXISD::Tex3DU32FloatLevel:
  case NVPTXISD::Tex3DU32FloatGrad:
  case NVPTXISD::TexCubeFloatFloat:
  case NVPTXISD::TexCubeFloatFloatLevel:
  case NVPTXISD::TexCubeS32Float:
  case NVPTXISD::TexCubeS32FloatLevel:
  case NVPTXISD::TexCubeU32Float:
  case NVPTXISD::TexCubeU32FloatLevel:
  case NVPTXISD::TexCubeArrayFloatFloat:
  case NVPTXISD::TexCubeArrayFloatFloatLevel:
  case NVPTXISD::TexCubeArrayS32Float:
  case NVPTXISD::TexCubeArrayS32FloatLevel:
  case NVPTXISD::TexCubeArrayU32Float:
  case NVPTXISD::TexCubeArrayU32FloatLevel:
  case NVPTXISD::Tld4R2DFloatFloat:
  case NVPTXISD::Tld4G2DFloatFloat:
  case NVPTXISD::Tld4B2DFloatFloat:
  case NVPTXISD::Tld4A2DFloatFloat:
  case NVPTXISD::Tld4R2DS64Float:
  case NVPTXISD::Tld4G2DS64Float:
  case NVPTXISD::Tld4B2DS64Float:
  case NVPTXISD::Tld4A2DS64Float:
  case NVPTXISD::Tld4R2DU64Float:
  case NVPTXISD::Tld4G2DU64Float:
  case NVPTXISD::Tld4B2DU64Float:
  case NVPTXISD::Tld4A2DU64Float:
  case NVPTXISD::TexUnified1DFloatS32:
  case NVPTXISD::TexUnified1DFloatFloat:
  case NVPTXISD::TexUnified1DFloatFloatLevel:
  case NVPTXISD::TexUnified1DFloatFloatGrad:
  case NVPTXISD::TexUnified1DS32S32:
  case NVPTXISD::TexUnified1DS32Float:
  case NVPTXISD::TexUnified1DS32FloatLevel:
  case NVPTXISD::TexUnified1DS32FloatGrad:
  case NVPTXISD::TexUnified1DU32S32:
  case NVPTXISD::TexUnified1DU32Float:
  case NVPTXISD::TexUnified1DU32FloatLevel:
  case NVPTXISD::TexUnified1DU32FloatGrad:
  case NVPTXISD::TexUnified1DArrayFloatS32:
  case NVPTXISD::TexUnified1DArrayFloatFloat:
  case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
  case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
  case NVPTXISD::TexUnified1DArrayS32S32:
  case NVPTXISD::TexUnified1DArrayS32Float:
  case NVPTXISD::TexUnified1DArrayS32FloatLevel:
  case NVPTXISD::TexUnified1DArrayS32FloatGrad:
  case NVPTXISD::TexUnified1DArrayU32S32:
  case NVPTXISD::TexUnified1DArrayU32Float:
  case NVPTXISD::TexUnified1DArrayU32FloatLevel:
  case NVPTXISD::TexUnified1DArrayU32FloatGrad:
  case NVPTXISD::TexUnified2DFloatS32:
  case NVPTXISD::TexUnified2DFloatFloat:
  case NVPTXISD::TexUnified2DFloatFloatLevel:
  case NVPTXISD::TexUnified2DFloatFloatGrad:
  case NVPTXISD::TexUnified2DS32S32:
  case NVPTXISD::TexUnified2DS32Float:
  case NVPTXISD::TexUnified2DS32FloatLevel:
  case NVPTXISD::TexUnified2DS32FloatGrad:
  case NVPTXISD::TexUnified2DU32S32:
  case NVPTXISD::TexUnified2DU32Float:
  case NVPTXISD::TexUnified2DU32FloatLevel:
  case NVPTXISD::TexUnified2DU32FloatGrad:
  case NVPTXISD::TexUnified2DArrayFloatS32:
  case NVPTXISD::TexUnified2DArrayFloatFloat:
  case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
  case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
  case NVPTXISD::TexUnified2DArrayS32S32:
  case NVPTXISD::TexUnified2DArrayS32Float:
  case NVPTXISD::TexUnified2DArrayS32FloatLevel:
  case NVPTXISD::TexUnified2DArrayS32FloatGrad:
  case NVPTXISD::TexUnified2DArrayU32S32:
  case NVPTXISD::TexUnified2DArrayU32Float:
  case NVPTXISD::TexUnified2DArrayU32FloatLevel:
  case NVPTXISD::TexUnified2DArrayU32FloatGrad:
  case NVPTXISD::TexUnified3DFloatS32:
  case NVPTXISD::TexUnified3DFloatFloat:
  case NVPTXISD::TexUnified3DFloatFloatLevel:
  case NVPTXISD::TexUnified3DFloatFloatGrad:
  case NVPTXISD::TexUnified3DS32S32:
  case NVPTXISD::TexUnified3DS32Float:
  case NVPTXISD::TexUnified3DS32FloatLevel:
  case NVPTXISD::TexUnified3DS32FloatGrad:
  case NVPTXISD::TexUnified3DU32S32:
  case NVPTXISD::TexUnified3DU32Float:
  case NVPTXISD::TexUnified3DU32FloatLevel:
  case NVPTXISD::TexUnified3DU32FloatGrad:
  case NVPTXISD::TexUnifiedCubeFloatFloat:
  case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
  case NVPTXISD::TexUnifiedCubeS32Float:
  case NVPTXISD::TexUnifiedCubeS32FloatLevel:
  case NVPTXISD::TexUnifiedCubeU32Float:
  case NVPTXISD::TexUnifiedCubeU32FloatLevel:
  case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
  case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
  case NVPTXISD::TexUnifiedCubeArrayS32Float:
  case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
  case NVPTXISD::TexUnifiedCubeArrayU32Float:
  case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
  case NVPTXISD::TexUnifiedCubeFloatFloatGrad:
  case NVPTXISD::TexUnifiedCubeS32FloatGrad:
  case NVPTXISD::TexUnifiedCubeU32FloatGrad:
  case NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad:
  case NVPTXISD::TexUnifiedCubeArrayS32FloatGrad:
  case NVPTXISD::TexUnifiedCubeArrayU32FloatGrad:
  case NVPTXISD::Tld4UnifiedR2DFloatFloat:
  case NVPTXISD::Tld4UnifiedG2DFloatFloat:
  case NVPTXISD::Tld4UnifiedB2DFloatFloat:
  case NVPTXISD::Tld4UnifiedA2DFloatFloat:
  case NVPTXISD::Tld4UnifiedR2DS64Float:
  case NVPTXISD::Tld4UnifiedG2DS64Float:
  case NVPTXISD::Tld4UnifiedB2DS64Float:
  case NVPTXISD::Tld4UnifiedA2DS64Float:
  case NVPTXISD::Tld4UnifiedR2DU64Float:
  case NVPTXISD::Tld4UnifiedG2DU64Float:
  case NVPTXISD::Tld4UnifiedB2DU64Float:
  case NVPTXISD::Tld4UnifiedA2DU64Float:
    if (tryTextureIntrinsic(N))
      return;
    break;
  case NVPTXISD::Suld1DI8Clamp:
  case NVPTXISD::Suld1DI16Clamp:
  case NVPTXISD::Suld1DI32Clamp:
  case NVPTXISD::Suld1DI64Clamp:
  case NVPTXISD::Suld1DV2I8Clamp:
  case NVPTXISD::Suld1DV2I16Clamp:
  case NVPTXISD::Suld1DV2I32Clamp:
  case NVPTXISD::Suld1DV2I64Clamp:
  case NVPTXISD::Suld1DV4I8Clamp:
  case NVPTXISD::Suld1DV4I16Clamp:
  case NVPTXISD::Suld1DV4I32Clamp:
  case NVPTXISD::Suld1DArrayI8Clamp:
  case NVPTXISD::Suld1DArrayI16Clamp:
  case NVPTXISD::Suld1DArrayI32Clamp:
  case NVPTXISD::Suld1DArrayI64Clamp:
  case NVPTXISD::Suld1DArrayV2I8Clamp:
  case NVPTXISD::Suld1DArrayV2I16Clamp:
  case NVPTXISD::Suld1DArrayV2I32Clamp:
  case NVPTXISD::Suld1DArrayV2I64Clamp:
  case NVPTXISD::Suld1DArrayV4I8Clamp:
  case NVPTXISD::Suld1DArrayV4I16Clamp:
  case NVPTXISD::Suld1DArrayV4I32Clamp:
  case NVPTXISD::Suld2DI8Clamp:
  case NVPTXISD::Suld2DI16Clamp:
  case NVPTXISD::Suld2DI32Clamp:
  case NVPTXISD::Suld2DI64Clamp:
  case NVPTXISD::Suld2DV2I8Clamp:
  case NVPTXISD::Suld2DV2I16Clamp:
  case NVPTXISD::Suld2DV2I32Clamp:
  case NVPTXISD::Suld2DV2I64Clamp:
  case NVPTXISD::Suld2DV4I8Clamp:
  case NVPTXISD::Suld2DV4I16Clamp:
  case NVPTXISD::Suld2DV4I32Clamp:
  case NVPTXISD::Suld2DArrayI8Clamp:
  case NVPTXISD::Suld2DArrayI16Clamp:
  case NVPTXISD::Suld2DArrayI32Clamp:
  case NVPTXISD::Suld2DArrayI64Clamp:
  case NVPTXISD::Suld2DArrayV2I8Clamp:
  case NVPTXISD::Suld2DArrayV2I16Clamp:
  case NVPTXISD::Suld2DArrayV2I32Clamp:
  case NVPTXISD::Suld2DArrayV2I64Clamp:
  case NVPTXISD::Suld2DArrayV4I8Clamp:
  case NVPTXISD::Suld2DArrayV4I16Clamp:
  case NVPTXISD::Suld2DArrayV4I32Clamp:
  case NVPTXISD::Suld3DI8Clamp:
  case NVPTXISD::Suld3DI16Clamp:
  case NVPTXISD::Suld3DI32Clamp:
  case NVPTXISD::Suld3DI64Clamp:
  case NVPTXISD::Suld3DV2I8Clamp:
  case NVPTXISD::Suld3DV2I16Clamp:
  case NVPTXISD::Suld3DV2I32Clamp:
  case NVPTXISD::Suld3DV2I64Clamp:
  case NVPTXISD::Suld3DV4I8Clamp:
  case NVPTXISD::Suld3DV4I16Clamp:
  case NVPTXISD::Suld3DV4I32Clamp:
  case NVPTXISD::Suld1DI8Trap:
  case NVPTXISD::Suld1DI16Trap:
  case NVPTXISD::Suld1DI32Trap:
  case NVPTXISD::Suld1DI64Trap:
  case NVPTXISD::Suld1DV2I8Trap:
  case NVPTXISD::Suld1DV2I16Trap:
  case NVPTXISD::Suld1DV2I32Trap:
  case NVPTXISD::Suld1DV2I64Trap:
  case NVPTXISD::Suld1DV4I8Trap:
  case NVPTXISD::Suld1DV4I16Trap:
  case NVPTXISD::Suld1DV4I32Trap:
  case NVPTXISD::Suld1DArrayI8Trap:
  case NVPTXISD::Suld1DArrayI16Trap:
  case NVPTXISD::Suld1DArrayI32Trap:
  case NVPTXISD::Suld1DArrayI64Trap:
  case NVPTXISD::Suld1DArrayV2I8Trap:
  case NVPTXISD::Suld1DArrayV2I16Trap:
  case NVPTXISD::Suld1DArrayV2I32Trap:
  case NVPTXISD::Suld1DArrayV2I64Trap:
  case NVPTXISD::Suld1DArrayV4I8Trap:
  case NVPTXISD::Suld1DArrayV4I16Trap:
  case NVPTXISD::Suld1DArrayV4I32Trap:
  case NVPTXISD::Suld2DI8Trap:
  case NVPTXISD::Suld2DI16Trap:
  case NVPTXISD::Suld2DI32Trap:
  case NVPTXISD::Suld2DI64Trap:
  case NVPTXISD::Suld2DV2I8Trap:
  case NVPTXISD::Suld2DV2I16Trap:
  case NVPTXISD::Suld2DV2I32Trap:
  case NVPTXISD::Suld2DV2I64Trap:
  case NVPTXISD::Suld2DV4I8Trap:
  case NVPTXISD::Suld2DV4I16Trap:
  case NVPTXISD::Suld2DV4I32Trap:
  case NVPTXISD::Suld2DArrayI8Trap:
  case NVPTXISD::Suld2DArrayI16Trap:
  case NVPTXISD::Suld2DArrayI32Trap:
  case NVPTXISD::Suld2DArrayI64Trap:
  case NVPTXISD::Suld2DArrayV2I8Trap:
  case NVPTXISD::Suld2DArrayV2I16Trap:
  case NVPTXISD::Suld2DArrayV2I32Trap:
  case NVPTXISD::Suld2DArrayV2I64Trap:
  case NVPTXISD::Suld2DArrayV4I8Trap:
  case NVPTXISD::Suld2DArrayV4I16Trap:
  case NVPTXISD::Suld2DArrayV4I32Trap:
  case NVPTXISD::Suld3DI8Trap:
  case NVPTXISD::Suld3DI16Trap:
  case NVPTXISD::Suld3DI32Trap:
  case NVPTXISD::Suld3DI64Trap:
  case NVPTXISD::Suld3DV2I8Trap:
  case NVPTXISD::Suld3DV2I16Trap:
  case NVPTXISD::Suld3DV2I32Trap:
  case NVPTXISD::Suld3DV2I64Trap:
  case NVPTXISD::Suld3DV4I8Trap:
  case NVPTXISD::Suld3DV4I16Trap:
  case NVPTXISD::Suld3DV4I32Trap:
  case NVPTXISD::Suld1DI8Zero:
  case NVPTXISD::Suld1DI16Zero:
  case NVPTXISD::Suld1DI32Zero:
  case NVPTXISD::Suld1DI64Zero:
  case NVPTXISD::Suld1DV2I8Zero:
  case NVPTXISD::Suld1DV2I16Zero:
  case NVPTXISD::Suld1DV2I32Zero:
  case NVPTXISD::Suld1DV2I64Zero:
  case NVPTXISD::Suld1DV4I8Zero:
  case NVPTXISD::Suld1DV4I16Zero:
  case NVPTXISD::Suld1DV4I32Zero:
  case NVPTXISD::Suld1DArrayI8Zero:
  case NVPTXISD::Suld1DArrayI16Zero:
  case NVPTXISD::Suld1DArrayI32Zero:
  case NVPTXISD::Suld1DArrayI64Zero:
  case NVPTXISD::Suld1DArrayV2I8Zero:
  case NVPTXISD::Suld1DArrayV2I16Zero:
  case NVPTXISD::Suld1DArrayV2I32Zero:
  case NVPTXISD::Suld1DArrayV2I64Zero:
  case NVPTXISD::Suld1DArrayV4I8Zero:
  case NVPTXISD::Suld1DArrayV4I16Zero:
  case NVPTXISD::Suld1DArrayV4I32Zero:
  case NVPTXISD::Suld2DI8Zero:
  case NVPTXISD::Suld2DI16Zero:
  case NVPTXISD::Suld2DI32Zero:
  case NVPTXISD::Suld2DI64Zero:
  case NVPTXISD::Suld2DV2I8Zero:
  case NVPTXISD::Suld2DV2I16Zero:
  case NVPTXISD::Suld2DV2I32Zero:
  case NVPTXISD::Suld2DV2I64Zero:
  case NVPTXISD::Suld2DV4I8Zero:
  case NVPTXISD::Suld2DV4I16Zero:
  case NVPTXISD::Suld2DV4I32Zero:
  case NVPTXISD::Suld2DArrayI8Zero:
  case NVPTXISD::Suld2DArrayI16Zero:
  case NVPTXISD::Suld2DArrayI32Zero:
  case NVPTXISD::Suld2DArrayI64Zero:
  case NVPTXISD::Suld2DArrayV2I8Zero:
  case NVPTXISD::Suld2DArrayV2I16Zero:
  case NVPTXISD::Suld2DArrayV2I32Zero:
  case NVPTXISD::Suld2DArrayV2I64Zero:
  case NVPTXISD::Suld2DArrayV4I8Zero:
  case NVPTXISD::Suld2DArrayV4I16Zero:
  case NVPTXISD::Suld2DArrayV4I32Zero:
  case NVPTXISD::Suld3DI8Zero:
  case NVPTXISD::Suld3DI16Zero:
  case NVPTXISD::Suld3DI32Zero:
  case NVPTXISD::Suld3DI64Zero:
  case NVPTXISD::Suld3DV2I8Zero:
  case NVPTXISD::Suld3DV2I16Zero:
  case NVPTXISD::Suld3DV2I32Zero:
  case NVPTXISD::Suld3DV2I64Zero:
  case NVPTXISD::Suld3DV4I8Zero:
  case NVPTXISD::Suld3DV4I16Zero:
  case NVPTXISD::Suld3DV4I32Zero:
    if (trySurfaceIntrinsic(N))
      return;
    break;
  case ISD::AND:
  case ISD::SRA:
  case ISD::SRL:
    // Try to select BFE
    if (tryBFE(N))
      return;
    break;
  case ISD::ADDRSPACECAST:
    SelectAddrSpaceCast(N);
    return;
  case ISD::ConstantFP:
    if (tryConstantFP(N))
      return;
    break;
  case ISD::CopyToReg: {
    if (N->getOperand(1).getValueType() == MVT::i128) {
      SelectV2I64toI128(N);
      return;
    }
    break;
  }
  case ISD::CopyFromReg: {
    if (N->getOperand(1).getValueType() == MVT::i128) {
      SelectI128toV2I64(N);
      return;
    }
    break;
  }
  default:
    break;
  }
  SelectCode(N);
}

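// Chained intrinsics only need custom selection for the ldg/ldu family;
// everything else falls through to the auto-generated matcher.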
bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
  unsigned IID = N->getConstantOperandVal(1);
  switch (IID) {
  default:
    return false;
  case Intrinsic::nvvm_ldg_global_f:
  case Intrinsic::nvvm_ldg_global_i:
  case Intrinsic::nvvm_ldg_global_p:
  case Intrinsic::nvvm_ldu_global_f:
  case Intrinsic::nvvm_ldu_global_i:
  case Intrinsic::nvvm_ldu_global_p:
    return tryLDGLDU(N);
  }
}

// There's no way to specify FP16 and BF16 immediates in .(b)f16 ops, so we
// have to load them into an .(b)f16 register first.
bool NVPTXDAGToDAGISel::tryConstantFP(SDNode *N) {
  if (N->getValueType(0) != MVT::f16 && N->getValueType(0) != MVT::bf16)
    return false;
  SDValue Val = CurDAG->getTargetConstantFP(
      cast<ConstantFPSDNode>(N)->getValueAPF(), SDLoc(N), N->getValueType(0));
  SDNode *LoadConstF16 = CurDAG->getMachineNode(
      (N->getValueType(0) == MVT::f16 ? NVPTX::LOAD_CONST_F16
                                      : NVPTX::LOAD_CONST_BF16),
      SDLoc(N), N->getValueType(0), Val);
  ReplaceNode(N, LoadConstF16);
  return true;
}

// Map an ISD::CONDCODE value to the appropriate CmpMode expected by
// NVPTXInstPrinter::printCmpMode()
static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ) {
  using NVPTX::PTXCmpMode::CmpMode;
  unsigned PTXCmpMode = [](ISD::CondCode CC) {
    switch (CC) {
    default:
      llvm_unreachable("Unexpected condition code.");
    case ISD::SETOEQ:
      return CmpMode::EQ;
    case ISD::SETOGT:
      return CmpMode::GT;
    case ISD::SETOGE:
      return CmpMode::GE;
    case ISD::SETOLT:
      return CmpMode::LT;
    case ISD::SETOLE:
      return CmpMode::LE;
    case ISD::SETONE:
      return CmpMode::NE;
    case ISD::SETO:
      return CmpMode::NUM;
    case ISD::SETUO:
      return CmpMode::NotANumber;
    case ISD::SETUEQ:
      return CmpMode::EQU;
    case ISD::SETUGT:
      return CmpMode::GTU;
    case ISD::SETUGE:
      return CmpMode::GEU;
    case ISD::SETULT:
      return CmpMode::LTU;
    case ISD::SETULE:
      return CmpMode::LEU;
    case ISD::SETUNE:
      return CmpMode::NEU;
    case ISD::SETEQ:
      return CmpMode::EQ;
    case ISD::SETGT:
      return CmpMode::GT;
    case ISD::SETGE:
      return CmpMode::GE;
    case ISD::SETLT:
      return CmpMode::LT;
    case ISD::SETLE:
      return CmpMode::LE;
    case ISD::SETNE:
      return CmpMode::NE;
    }
  }(CondCode.get());

  if (FTZ)
    PTXCmpMode |= NVPTX::PTXCmpMode::FTZ_FLAG;

  return PTXCmpMode;
}

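// Packed 16-bit comparisons produce one predicate per lane, so the SETP
// machine nodes below carry two i1 results.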
bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
  unsigned PTXCmpMode =
      getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
  SDLoc DL(N);
  SDNode *SetP = CurDAG->getMachineNode(
      NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
      N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
  ReplaceNode(N, SetP);
  return true;
}

bool NVPTXDAGToDAGISel::SelectSETP_BF16X2(SDNode *N) {
  unsigned PTXCmpMode =
      getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
  SDLoc DL(N);
  SDNode *SetP = CurDAG->getMachineNode(
      NVPTX::SETP_bf16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
      N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
  ReplaceNode(N, SetP);
  return true;
}

// Find all instances of extract_vector_elt that use this v2f16 vector
// and coalesce them into a scattering move instruction.
bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
  SDValue Vector = N->getOperand(0);

  // We only care about 16x2 as it's the only real vector type we
  // need to deal with.
  MVT VT = Vector.getSimpleValueType();
  if (!Isv2x16VT(VT))
    return false;
  // Find and record all uses of this vector that extract element 0 or 1.
  SmallVector<SDNode *, 4> E0, E1;
  for (auto *U : Vector.getNode()->uses()) {
    if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      continue;
    if (U->getOperand(0) != Vector)
      continue;
    if (const ConstantSDNode *IdxConst =
            dyn_cast<ConstantSDNode>(U->getOperand(1))) {
      if (IdxConst->getZExtValue() == 0)
        E0.push_back(U);
      else if (IdxConst->getZExtValue() == 1)
        E1.push_back(U);
      else
        llvm_unreachable("Invalid vector index.");
    }
  }

  // There's no point scattering f16x2 if we only ever access one
  // element of it.
  if (E0.empty() || E1.empty())
    return false;

  // Merge (f16 extractelt(V, 0), f16 extractelt(V,1))
  // into f16,f16 SplitF16x2(V)
  MVT EltVT = VT.getVectorElementType();
  SDNode *ScatterOp =
      CurDAG->getMachineNode(NVPTX::I32toV2I16, SDLoc(N), EltVT, EltVT, Vector);
  for (auto *Node : E0)
    ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0));
  for (auto *Node : E1)
    ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 1));

  return true;
}

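// Map the load/store's IR pointer address space to the PTX instruction
// address-space code, defaulting to generic when no pointer is available.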
static unsigned int getCodeAddrSpace(MemSDNode *N) {
  const Value *Src = N->getMemOperand()->getValue();

  if (!Src)
    return NVPTX::PTXLdStInstCode::GENERIC;

  if (auto *PT = dyn_cast<PointerType>(Src->getType())) {
    switch (PT->getAddressSpace()) {
    case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::PTXLdStInstCode::LOCAL;
    case llvm::ADDRESS_SPACE_GLOBAL: return NVPTX::PTXLdStInstCode::GLOBAL;
    case llvm::ADDRESS_SPACE_SHARED: return NVPTX::PTXLdStInstCode::SHARED;
    case llvm::ADDRESS_SPACE_GENERIC: return NVPTX::PTXLdStInstCode::GENERIC;
    case llvm::ADDRESS_SPACE_PARAM: return NVPTX::PTXLdStInstCode::PARAM;
    case llvm::ADDRESS_SPACE_CONST: return NVPTX::PTXLdStInstCode::CONSTANT;
    default: break;
    }
  }
  return NVPTX::PTXLdStInstCode::GENERIC;
}

static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
                          unsigned CodeAddrSpace, MachineFunction *F) {
  // We use ldg (i.e. ld.global.nc) for invariant loads from the global address
  // space.
  //
  // We have two ways of identifying invariant loads: Loads may be explicitly
  // marked as invariant, or we may infer them to be invariant.
  //
  // We currently infer invariance for loads from
  //  - constant global variables, and
  //  - kernel function pointer params that are noalias (i.e. __restrict) and
  //    never written to.
  //
  // TODO: Perform a more powerful invariance analysis (ideally IPO, and ideally
  // not during the SelectionDAG phase).
  //
  // TODO: Infer invariance only at -O2. We still want to use ldg at -O0 for
  // explicitly invariant loads because these are how clang tells us to use ldg
  // when the user uses a builtin.
  if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL)
    return false;

  if (N->isInvariant())
    return true;

  bool IsKernelFn = isKernelFunction(F->getFunction());

  // We use getUnderlyingObjects() here instead of getUnderlyingObject() mainly
  // because the former looks through phi nodes while the latter does not. We
  // need to look through phi nodes to handle pointer induction variables.
  SmallVector<const Value *, 8> Objs;
  getUnderlyingObjects(N->getMemOperand()->getValue(), Objs);

  return all_of(Objs, [&](const Value *V) {
    if (auto *A = dyn_cast<const Argument>(V))
      return IsKernelFn && A->onlyReadsMemory() && A->hasNoAliasAttr();
    if (auto *GV = dyn_cast<const GlobalVariable>(V))
      return GV->isConstant();
    return false;
  });
}

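// Intrinsics without a chain currently only need custom selection to
// materialize texture/surface handles; all others use generated patterns.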
bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) {
  unsigned IID = N->getConstantOperandVal(0);
  switch (IID) {
  default:
    return false;
  case Intrinsic::nvvm_texsurf_handle_internal:
    SelectTexSurfHandle(N);
    return true;
  }
}

void NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) {
  // Op 0 is the intrinsic ID
  SDValue Wrapper = N->getOperand(1);
  SDValue GlobalVal = Wrapper.getOperand(0);
  ReplaceNode(N, CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N),
                                        MVT::i64, GlobalVal));
}

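// Lower an addrspacecast to the matching conversion instruction: casts into
// the generic space use cvta.*, casts out of it use cvta.to.*.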
void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
  SDValue Src = N->getOperand(0);
  AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
  unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
  unsigned DstAddrSpace = CastN->getDestAddressSpace();
  assert(SrcAddrSpace != DstAddrSpace &&
         "addrspacecast must be between different address spaces");

  if (DstAddrSpace == ADDRESS_SPACE_GENERIC) {
    // Specific to generic
    unsigned Opc;
    switch (SrcAddrSpace) {
    default: report_fatal_error("Bad address space in addrspacecast");
    case ADDRESS_SPACE_GLOBAL:
      Opc = TM.is64Bit() ? NVPTX::cvta_global_64 : NVPTX::cvta_global;
      break;
    case ADDRESS_SPACE_SHARED:
      Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(SrcAddrSpace) == 32
                                ? NVPTX::cvta_shared_6432
                                : NVPTX::cvta_shared_64)
                         : NVPTX::cvta_shared;
      break;
    case ADDRESS_SPACE_CONST:
      Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(SrcAddrSpace) == 32
                                ? NVPTX::cvta_const_6432
                                : NVPTX::cvta_const_64)
                         : NVPTX::cvta_const;
      break;
    case ADDRESS_SPACE_LOCAL:
      Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(SrcAddrSpace) == 32
                                ? NVPTX::cvta_local_6432
                                : NVPTX::cvta_local_64)
                         : NVPTX::cvta_local;
      break;
    }
    ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
                                          Src));
    return;
  } else {
    // Generic to specific
    if (SrcAddrSpace != 0)
      report_fatal_error("Cannot cast between two non-generic address spaces");
    unsigned Opc;
    switch (DstAddrSpace) {
    default: report_fatal_error("Bad address space in addrspacecast");
    case ADDRESS_SPACE_GLOBAL:
      Opc = TM.is64Bit() ? NVPTX::cvta_to_global_64 : NVPTX::cvta_to_global;
      break;
    case ADDRESS_SPACE_SHARED:
      Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(DstAddrSpace) == 32
                                ? NVPTX::cvta_to_shared_3264
                                : NVPTX::cvta_to_shared_64)
                         : NVPTX::cvta_to_shared;
      break;
    case ADDRESS_SPACE_CONST:
      Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(DstAddrSpace) == 32
                                ? NVPTX::cvta_to_const_3264
                                : NVPTX::cvta_to_const_64)
                         : NVPTX::cvta_to_const;
      break;
    case ADDRESS_SPACE_LOCAL:
      Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(DstAddrSpace) == 32
                                ? NVPTX::cvta_to_local_3264
                                : NVPTX::cvta_to_local_64)
                         : NVPTX::cvta_to_local;
      break;
    case ADDRESS_SPACE_PARAM:
      Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64
                         : NVPTX::nvvm_ptr_gen_to_param;
      break;
    }
    ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
                                          Src));
    return;
  }
}

// Helper function template to reduce amount of boilerplate code for
// opcode selection.
static std::optional<unsigned>
pickOpcodeForVT(MVT::SimpleValueType VT, unsigned Opcode_i8,
                unsigned Opcode_i16, unsigned Opcode_i32,
                std::optional<unsigned> Opcode_i64, unsigned Opcode_f32,
                std::optional<unsigned> Opcode_f64) {
  switch (VT) {
  case MVT::i1:
  case MVT::i8:
    return Opcode_i8;
  case MVT::i16:
    return Opcode_i16;
  case MVT::i32:
    return Opcode_i32;
  case MVT::i64:
    return Opcode_i64;
  case MVT::f16:
  case MVT::bf16:
    return Opcode_i16;
  case MVT::v2f16:
  case MVT::v2bf16:
  case MVT::v2i16:
  case MVT::v4i8:
    return Opcode_i32;
  case MVT::f32:
    return Opcode_f32;
  case MVT::f64:
    return Opcode_f64;
  default:
    return std::nullopt;
  }
}

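// PTX register type for a ld/st value: f16/bf16 and their v2 forms move as
// untyped bits, other FP types as floats, and everything else as unsigned.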
static int getLdStRegType(EVT VT) {
  if (VT.isFloatingPoint())
    switch (VT.getSimpleVT().SimpleTy) {
    case MVT::f16:
    case MVT::bf16:
    case MVT::v2f16:
    case MVT::v2bf16:
      return NVPTX::PTXLdStInstCode::Untyped;
    default:
      return NVPTX::PTXLdStInstCode::Float;
    }
  else
    return NVPTX::PTXLdStInstCode::Unsigned;
}

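// Select a scalar (or monotonic atomic) load into one of the NVPTX LD_*
// machine instructions.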
bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
  SDLoc dl(N);
  MemSDNode *LD = cast<MemSDNode>(N);
  assert(LD->readMem() && "Expected load");
  LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(N);
  EVT LoadedVT = LD->getMemoryVT();
  SDNode *NVPTXLD = nullptr;

  // do not support pre/post inc/dec
  if (PlainLoad && PlainLoad->isIndexed())
    return false;

  if (!LoadedVT.isSimple())
    return false;

  AtomicOrdering Ordering = LD->getSuccessOrdering();
  // In order to lower atomic loads with stronger guarantees we would need to
  // use load.acquire or insert fences. However these features were only added
  // with PTX ISA 6.0 / sm_70.
  // TODO: Check if we can actually use the new instructions and implement them.
  if (isStrongerThanMonotonic(Ordering))
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(LD);
  if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {
    return tryLDGLDU(N);
  }

  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  // - .volatile has the same memory synchronization semantics as .relaxed.sys
  bool isVolatile = LD->isVolatile() || Ordering == AtomicOrdering::Monotonic;
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    isVolatile = false;

  // Type Setting: fromType + fromTypeWidth
  //
  // Sign   : ISD::SEXTLOAD
  // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
  //          type is integer
  // Float  : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
  MVT SimpleVT = LoadedVT.getSimpleVT();
  MVT ScalarVT = SimpleVT.getScalarType();
  // Read at least 8 bits (predicates are stored as 8-bit values)
  unsigned fromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
  unsigned int fromType;

  // Vector Setting
  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
  if (SimpleVT.isVector()) {
    assert((Isv2x16VT(LoadedVT) || LoadedVT == MVT::v4i8) &&
           "Unexpected vector type");
    // v2f16/v2bf16/v2i16 is loaded using ld.b32
    fromTypeWidth = 32;
  }

  if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
    fromType = NVPTX::PTXLdStInstCode::Signed;
  else
    fromType = getLdStRegType(ScalarVT);

  // Create the machine instruction DAG
  SDValue Chain = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue Addr;
  SDValue Offset, Base;
  std::optional<unsigned> Opcode;
  MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;

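  // Try addressing modes from most to least specific: direct symbol (avar),
  // symbol+immediate (asi), register+immediate (ari), then plain register
  // (areg).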
  if (SelectDirectAddr(N1, Addr)) {
    Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar,
                             NVPTX::LD_i32_avar, NVPTX::LD_i64_avar,
                             NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Addr, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  } else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
                               : SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
    Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi,
                             NVPTX::LD_i32_asi, NVPTX::LD_i64_asi,
                             NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  } else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
                               : SelectADDRri(N1.getNode(), N1, Base, Offset)) {
    if (PointerSize == 64)
      Opcode =
          pickOpcodeForVT(TargetVT, NVPTX::LD_i8_ari_64, NVPTX::LD_i16_ari_64,
                          NVPTX::LD_i32_ari_64, NVPTX::LD_i64_ari_64,
                          NVPTX::LD_f32_ari_64, NVPTX::LD_f64_ari_64);
    else
      Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_ari, NVPTX::LD_i16_ari,
                               NVPTX::LD_i32_ari, NVPTX::LD_i64_ari,
                               NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  } else {
    if (PointerSize == 64)
      Opcode =
          pickOpcodeForVT(TargetVT, NVPTX::LD_i8_areg_64, NVPTX::LD_i16_areg_64,
                          NVPTX::LD_i32_areg_64, NVPTX::LD_i64_areg_64,
                          NVPTX::LD_f32_areg_64, NVPTX::LD_f64_areg_64);
    else
      Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_areg, NVPTX::LD_i16_areg,
                               NVPTX::LD_i32_areg, NVPTX::LD_i64_areg,
                               NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), N1, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  }

  if (!NVPTXLD)
    return false;

  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});

  ReplaceNode(N, NVPTXLD);
  return true;
}

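// Vector loads (LoadV2/LoadV4) follow the same addressing-mode cascade as
// scalar loads but select the LDV_* opcode family.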
bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {

  SDValue Chain = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  SDValue Addr, Offset, Base;
  std::optional<unsigned> Opcode;
  SDLoc DL(N);
  SDNode *LD;
  MemSDNode *MemSD = cast<MemSDNode>(N);
  EVT LoadedVT = MemSD->getMemoryVT();

  if (!LoadedVT.isSimple())
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);
  if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) {
    return tryLDGLDU(N);
  }

  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  bool IsVolatile = MemSD->isVolatile();
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    IsVolatile = false;

  // Vector Setting
  MVT SimpleVT = LoadedVT.getSimpleVT();

  // Type Setting: fromType + fromTypeWidth
  //
  // Sign   : ISD::SEXTLOAD
  // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
  //          type is integer
  // Float  : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
  MVT ScalarVT = SimpleVT.getScalarType();
  // Read at least 8 bits (predicates are stored as 8-bit values)
  unsigned FromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
  unsigned int FromType;
  // The last operand holds the original LoadSDNode::getExtensionType() value
  unsigned ExtensionType = cast<ConstantSDNode>(
      N->getOperand(N->getNumOperands() - 1))->getZExtValue();
  if (ExtensionType == ISD::SEXTLOAD)
    FromType = NVPTX::PTXLdStInstCode::Signed;
  else
    FromType = getLdStRegType(ScalarVT);

  unsigned VecType;

  switch (N->getOpcode()) {
  case NVPTXISD::LoadV2:
    VecType = NVPTX::PTXLdStInstCode::V2;
    break;
  case NVPTXISD::LoadV4:
    VecType = NVPTX::PTXLdStInstCode::V4;
    break;
  default:
    return false;
  }

  EVT EltVT = N->getValueType(0);

  // v8x16 is a special case. PTX doesn't have ld.v8.16
  // instruction. Instead, we split the vector into v2x16 chunks and
  // load them with ld.v4.b32.
  if (Isv2x16VT(EltVT)) {
    assert(N->getOpcode() == NVPTXISD::LoadV4 && "Unexpected load opcode.");
    EltVT = MVT::i32;
    FromType = NVPTX::PTXLdStInstCode::Untyped;
    FromTypeWidth = 32;
  }

  if (SelectDirectAddr(Op1, Addr)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::LoadV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::LDV_i8_v2_avar, NVPTX::LDV_i16_v2_avar,
                               NVPTX::LDV_i32_v2_avar, NVPTX::LDV_i64_v2_avar,
                               NVPTX::LDV_f32_v2_avar, NVPTX::LDV_f64_v2_avar);
      break;
    case NVPTXISD::LoadV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_avar,
                          NVPTX::LDV_i16_v4_avar, NVPTX::LDV_i32_v4_avar,
                          std::nullopt, NVPTX::LDV_f32_v4_avar, std::nullopt);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Addr, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else if (PointerSize == 64
                 ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
                 : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::LoadV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::LDV_i8_v2_asi, NVPTX::LDV_i16_v2_asi,
                               NVPTX::LDV_i32_v2_asi, NVPTX::LDV_i64_v2_asi,
                               NVPTX::LDV_f32_v2_asi, NVPTX::LDV_f64_v2_asi);
      break;
    case NVPTXISD::LoadV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_asi,
                          NVPTX::LDV_i16_v4_asi, NVPTX::LDV_i32_v4_asi,
                          std::nullopt, NVPTX::LDV_f32_v4_asi, std::nullopt);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else if (PointerSize == 64
                 ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
                 : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                            NVPTX::LDV_i8_v2_ari_64, NVPTX::LDV_i16_v2_ari_64,
                            NVPTX::LDV_i32_v2_ari_64, NVPTX::LDV_i64_v2_ari_64,
                            NVPTX::LDV_f32_v2_ari_64, NVPTX::LDV_f64_v2_ari_64);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari_64,
            NVPTX::LDV_i16_v4_ari_64, NVPTX::LDV_i32_v4_ari_64, std::nullopt,
            NVPTX::LDV_f32_v4_ari_64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::LDV_i8_v2_ari, NVPTX::LDV_i16_v2_ari,
                                 NVPTX::LDV_i32_v2_ari, NVPTX::LDV_i64_v2_ari,
                                 NVPTX::LDV_f32_v2_ari, NVPTX::LDV_f64_v2_ari);
        break;
      case NVPTXISD::LoadV4:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari,
                            NVPTX::LDV_i16_v4_ari, NVPTX::LDV_i32_v4_ari,
                            std::nullopt, NVPTX::LDV_f32_v4_ari, std::nullopt);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };

    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else {
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg_64,
            NVPTX::LDV_i16_v2_areg_64, NVPTX::LDV_i32_v2_areg_64,
            NVPTX::LDV_i64_v2_areg_64, NVPTX::LDV_f32_v2_areg_64,
            NVPTX::LDV_f64_v2_areg_64);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg_64,
            NVPTX::LDV_i16_v4_areg_64, NVPTX::LDV_i32_v4_areg_64, std::nullopt,
            NVPTX::LDV_f32_v4_areg_64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg,
                            NVPTX::LDV_i16_v2_areg, NVPTX::LDV_i32_v2_areg,
                            NVPTX::LDV_i64_v2_areg, NVPTX::LDV_f32_v2_areg,
                            NVPTX::LDV_f64_v2_areg);
        break;
      case NVPTXISD::LoadV4:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg,
                            NVPTX::LDV_i16_v4_areg, NVPTX::LDV_i32_v4_areg,
                            std::nullopt, NVPTX::LDV_f32_v4_areg, std::nullopt);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Op1, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  }

  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(LD), {MemRef});

  ReplaceNode(N, LD);
  return true;
}

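// Select ld.global.nc (LDG) and ldu instructions, reached either through the
// nvvm_ldg/ldu intrinsics or through plain loads proven invariant by
// canLowerToLDG().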
1269 | bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) { |
1270 | |
1271 | SDValue Chain = N->getOperand(Num: 0); |
1272 | SDValue Op1; |
1273 | MemSDNode *Mem; |
1274 | bool IsLDG = true; |
1275 | |
1276 | // If this is an LDG intrinsic, the address is the third operand. If its an |
1277 | // LDG/LDU SD node (from custom vector handling), then its the second operand |
1278 | if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) { |
1279 | Op1 = N->getOperand(Num: 2); |
1280 | Mem = cast<MemIntrinsicSDNode>(Val: N); |
1281 | unsigned IID = N->getConstantOperandVal(Num: 1); |
1282 | switch (IID) { |
1283 | default: |
1284 | return false; |
1285 | case Intrinsic::nvvm_ldg_global_f: |
1286 | case Intrinsic::nvvm_ldg_global_i: |
1287 | case Intrinsic::nvvm_ldg_global_p: |
1288 | IsLDG = true; |
1289 | break; |
1290 | case Intrinsic::nvvm_ldu_global_f: |
1291 | case Intrinsic::nvvm_ldu_global_i: |
1292 | case Intrinsic::nvvm_ldu_global_p: |
1293 | IsLDG = false; |
1294 | break; |
1295 | } |
1296 | } else { |
1297 | Op1 = N->getOperand(Num: 1); |
1298 | Mem = cast<MemSDNode>(Val: N); |
1299 | } |
1300 | |
1301 | std::optional<unsigned> Opcode; |
1302 | SDLoc DL(N); |
1303 | SDNode *LD; |
1304 | SDValue Base, Offset, Addr; |
1305 | EVT OrigType = N->getValueType(ResNo: 0); |
1306 | |
1307 | EVT EltVT = Mem->getMemoryVT(); |
1308 | unsigned NumElts = 1; |
1309 | if (EltVT.isVector()) { |
1310 | NumElts = EltVT.getVectorNumElements(); |
1311 | EltVT = EltVT.getVectorElementType(); |
1312 | // vectors of 16bits type are loaded/stored as multiples of v2x16 elements. |
1313 | if ((EltVT == MVT::f16 && OrigType == MVT::v2f16) || |
1314 | (EltVT == MVT::bf16 && OrigType == MVT::v2bf16) || |
1315 | (EltVT == MVT::i16 && OrigType == MVT::v2i16)) { |
1316 | assert(NumElts % 2 == 0 && "Vector must have even number of elements" ); |
1317 | EltVT = OrigType; |
1318 | NumElts /= 2; |
1319 | } else if (OrigType == MVT::v4i8) { |
1320 | EltVT = OrigType; |
1321 | NumElts = 1; |
1322 | } |
1323 | } |
1324 | |
1325 | // Build the "promoted" result VTList for the load. If we are really loading |
1326 | // i8s, then the return type will be promoted to i16 since we do not expose |
1327 | // 8-bit registers in NVPTX. |
1328 | EVT NodeVT = (EltVT == MVT::i8) ? MVT::i16 : EltVT; |
1329 | SmallVector<EVT, 5> InstVTs; |
1330 | for (unsigned i = 0; i != NumElts; ++i) { |
1331 | InstVTs.push_back(Elt: NodeVT); |
1332 | } |
1333 | InstVTs.push_back(Elt: MVT::Other); |
1334 | SDVTList InstVTList = CurDAG->getVTList(VTs: InstVTs); |
1335 | |
1336 | if (SelectDirectAddr(N: Op1, Address&: Addr)) { |
1337 | switch (N->getOpcode()) { |
1338 | default: |
1339 | return false; |
1340 | case ISD::LOAD: |
1341 | case ISD::INTRINSIC_W_CHAIN: |
1342 | if (IsLDG) |
1343 | Opcode = pickOpcodeForVT(VT: EltVT.getSimpleVT().SimpleTy, |
1344 | Opcode_i8: NVPTX::INT_PTX_LDG_GLOBAL_i8avar, |
1345 | Opcode_i16: NVPTX::INT_PTX_LDG_GLOBAL_i16avar, |
1346 | Opcode_i32: NVPTX::INT_PTX_LDG_GLOBAL_i32avar, |
1347 | Opcode_i64: NVPTX::INT_PTX_LDG_GLOBAL_i64avar, |
1348 | Opcode_f32: NVPTX::INT_PTX_LDG_GLOBAL_f32avar, |
1349 | Opcode_f64: NVPTX::INT_PTX_LDG_GLOBAL_f64avar); |
1350 | else |
1351 | Opcode = pickOpcodeForVT(VT: EltVT.getSimpleVT().SimpleTy, |
1352 | Opcode_i8: NVPTX::INT_PTX_LDU_GLOBAL_i8avar, |
1353 | Opcode_i16: NVPTX::INT_PTX_LDU_GLOBAL_i16avar, |
1354 | Opcode_i32: NVPTX::INT_PTX_LDU_GLOBAL_i32avar, |
1355 | Opcode_i64: NVPTX::INT_PTX_LDU_GLOBAL_i64avar, |
1356 | Opcode_f32: NVPTX::INT_PTX_LDU_GLOBAL_f32avar, |
1357 | Opcode_f64: NVPTX::INT_PTX_LDU_GLOBAL_f64avar); |
1358 | break; |
1359 | case NVPTXISD::LoadV2: |
1360 | case NVPTXISD::LDGV2: |
1361 | Opcode = pickOpcodeForVT(VT: EltVT.getSimpleVT().SimpleTy, |
1362 | Opcode_i8: NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar, |
1363 | Opcode_i16: NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar, |
1364 | Opcode_i32: NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar, |
1365 | Opcode_i64: NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar, |
1366 | Opcode_f32: NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar, |
1367 | Opcode_f64: NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar); |
1368 | break; |
1369 | case NVPTXISD::LDUV2: |
1370 | Opcode = pickOpcodeForVT(VT: EltVT.getSimpleVT().SimpleTy, |
1371 | Opcode_i8: NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar, |
1372 | Opcode_i16: NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar, |
1373 | Opcode_i32: NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar, |
1374 | Opcode_i64: NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar, |
1375 | Opcode_f32: NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar, |
1376 | Opcode_f64: NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar); |
1377 | break; |
1378 | case NVPTXISD::LoadV4: |
1379 | case NVPTXISD::LDGV4: |
1380 | Opcode = pickOpcodeForVT( |
1381 | VT: EltVT.getSimpleVT().SimpleTy, Opcode_i8: NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar, |
1382 | Opcode_i16: NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar, |
1383 | Opcode_i32: NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar, Opcode_i64: std::nullopt, |
1384 | Opcode_f32: NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar, Opcode_f64: std::nullopt); |
1385 | break; |
1386 | case NVPTXISD::LDUV4: |
1387 | Opcode = pickOpcodeForVT( |
1388 | VT: EltVT.getSimpleVT().SimpleTy, Opcode_i8: NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar, |
1389 | Opcode_i16: NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar, |
1390 | Opcode_i32: NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar, Opcode_i64: std::nullopt, |
1391 | Opcode_f32: NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar, Opcode_f64: std::nullopt); |
1392 | break; |
1393 | } |
1394 | if (!Opcode) |
1395 | return false; |
SDValue Ops[] = {Addr, Chain};
1397 | LD = CurDAG->getMachineNode(Opcode: *Opcode, dl: DL, VTs: InstVTList, Ops); |
1398 | } else if (TM.is64Bit() ? SelectADDRri64(OpNode: Op1.getNode(), Addr: Op1, Base, Offset) |
1399 | : SelectADDRri(OpNode: Op1.getNode(), Addr: Op1, Base, Offset)) { |
1400 | if (TM.is64Bit()) { |
1401 | switch (N->getOpcode()) { |
1402 | default: |
1403 | return false; |
1404 | case ISD::LOAD: |
1405 | case ISD::INTRINSIC_W_CHAIN: |
1406 | if (IsLDG) |
1407 | Opcode = pickOpcodeForVT(VT: EltVT.getSimpleVT().SimpleTy, |
1408 | Opcode_i8: NVPTX::INT_PTX_LDG_GLOBAL_i8ari64, |
1409 | Opcode_i16: NVPTX::INT_PTX_LDG_GLOBAL_i16ari64, |
1410 | Opcode_i32: NVPTX::INT_PTX_LDG_GLOBAL_i32ari64, |
1411 | Opcode_i64: NVPTX::INT_PTX_LDG_GLOBAL_i64ari64, |
1412 | Opcode_f32: NVPTX::INT_PTX_LDG_GLOBAL_f32ari64, |
1413 | Opcode_f64: NVPTX::INT_PTX_LDG_GLOBAL_f64ari64); |
1414 | else |
1415 | Opcode = pickOpcodeForVT(VT: EltVT.getSimpleVT().SimpleTy, |
1416 | Opcode_i8: NVPTX::INT_PTX_LDU_GLOBAL_i8ari64, |
1417 | Opcode_i16: NVPTX::INT_PTX_LDU_GLOBAL_i16ari64, |
1418 | Opcode_i32: NVPTX::INT_PTX_LDU_GLOBAL_i32ari64, |
1419 | Opcode_i64: NVPTX::INT_PTX_LDU_GLOBAL_i64ari64, |
1420 | Opcode_f32: NVPTX::INT_PTX_LDU_GLOBAL_f32ari64, |
1421 | Opcode_f64: NVPTX::INT_PTX_LDU_GLOBAL_f64ari64); |
1422 | break; |
1423 | case NVPTXISD::LoadV2: |
1424 | case NVPTXISD::LDGV2: |
1425 | Opcode = pickOpcodeForVT(VT: EltVT.getSimpleVT().SimpleTy, |
1426 | Opcode_i8: NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64, |
1427 | Opcode_i16: NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64, |
1428 | Opcode_i32: NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64, |
1429 | Opcode_i64: NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64, |
1430 | Opcode_f32: NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64, |
1431 | Opcode_f64: NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64); |
1432 | break; |
1433 | case NVPTXISD::LDUV2: |
1434 | Opcode = pickOpcodeForVT(VT: EltVT.getSimpleVT().SimpleTy, |
1435 | Opcode_i8: NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64, |
1436 | Opcode_i16: NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64, |
1437 | Opcode_i32: NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64, |
1438 | Opcode_i64: NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64, |
1439 | Opcode_f32: NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64, |
1440 | Opcode_f64: NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64); |
1441 | break; |
1442 | case NVPTXISD::LoadV4: |
1443 | case NVPTXISD::LDGV4: |
1444 | Opcode = pickOpcodeForVT( |
1445 | VT: EltVT.getSimpleVT().SimpleTy, Opcode_i8: NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64, |
1446 | Opcode_i16: NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64, |
1447 | Opcode_i32: NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64, Opcode_i64: std::nullopt, |
1448 | Opcode_f32: NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64, Opcode_f64: std::nullopt); |
1449 | break; |
1450 | case NVPTXISD::LDUV4: |
1451 | Opcode = pickOpcodeForVT( |
1452 | VT: EltVT.getSimpleVT().SimpleTy, Opcode_i8: NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64, |
1453 | Opcode_i16: NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64, |
1454 | Opcode_i32: NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64, Opcode_i64: std::nullopt, |
1455 | Opcode_f32: NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64, Opcode_f64: std::nullopt); |
1456 | break; |
1457 | } |
1458 | } else { |
1459 | switch (N->getOpcode()) { |
1460 | default: |
1461 | return false; |
1462 | case ISD::LOAD: |
1463 | case ISD::INTRINSIC_W_CHAIN: |
1464 | if (IsLDG) |
1465 | Opcode = pickOpcodeForVT(VT: EltVT.getSimpleVT().SimpleTy, |
1466 | Opcode_i8: NVPTX::INT_PTX_LDG_GLOBAL_i8ari, |
1467 | Opcode_i16: NVPTX::INT_PTX_LDG_GLOBAL_i16ari, |
1468 | Opcode_i32: NVPTX::INT_PTX_LDG_GLOBAL_i32ari, |
1469 | Opcode_i64: NVPTX::INT_PTX_LDG_GLOBAL_i64ari, |
1470 | Opcode_f32: NVPTX::INT_PTX_LDG_GLOBAL_f32ari, |
1471 | Opcode_f64: NVPTX::INT_PTX_LDG_GLOBAL_f64ari); |
1472 | else |
1473 | Opcode = pickOpcodeForVT(VT: EltVT.getSimpleVT().SimpleTy, |
1474 | Opcode_i8: NVPTX::INT_PTX_LDU_GLOBAL_i8ari, |
1475 | Opcode_i16: NVPTX::INT_PTX_LDU_GLOBAL_i16ari, |
1476 | Opcode_i32: NVPTX::INT_PTX_LDU_GLOBAL_i32ari, |
1477 | Opcode_i64: NVPTX::INT_PTX_LDU_GLOBAL_i64ari, |
1478 | Opcode_f32: NVPTX::INT_PTX_LDU_GLOBAL_f32ari, |
1479 | Opcode_f64: NVPTX::INT_PTX_LDU_GLOBAL_f64ari); |
1480 | break; |
1481 | case NVPTXISD::LoadV2: |
1482 | case NVPTXISD::LDGV2: |
1483 | Opcode = pickOpcodeForVT(VT: EltVT.getSimpleVT().SimpleTy, |
1484 | Opcode_i8: NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32, |
1485 | Opcode_i16: NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32, |
1486 | Opcode_i32: NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32, |
1487 | Opcode_i64: NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32, |
1488 | Opcode_f32: NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32, |
1489 | Opcode_f64: NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32); |
1490 | break; |
1491 | case NVPTXISD::LDUV2: |
1492 | Opcode = pickOpcodeForVT(VT: EltVT.getSimpleVT().SimpleTy, |
1493 | Opcode_i8: NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32, |
1494 | Opcode_i16: NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32, |
1495 | Opcode_i32: NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32, |
1496 | Opcode_i64: NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32, |
1497 | Opcode_f32: NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32, |
1498 | Opcode_f64: NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32); |
1499 | break; |
1500 | case NVPTXISD::LoadV4: |
1501 | case NVPTXISD::LDGV4: |
1502 | Opcode = pickOpcodeForVT( |
1503 | VT: EltVT.getSimpleVT().SimpleTy, Opcode_i8: NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32, |
1504 | Opcode_i16: NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32, |
1505 | Opcode_i32: NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32, Opcode_i64: std::nullopt, |
1506 | Opcode_f32: NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32, Opcode_f64: std::nullopt); |
1507 | break; |
1508 | case NVPTXISD::LDUV4: |
1509 | Opcode = pickOpcodeForVT( |
1510 | VT: EltVT.getSimpleVT().SimpleTy, Opcode_i8: NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32, |
1511 | Opcode_i16: NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32, |
1512 | Opcode_i32: NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32, Opcode_i64: std::nullopt, |
1513 | Opcode_f32: NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32, Opcode_f64: std::nullopt); |
1514 | break; |
1515 | } |
1516 | } |
1517 | if (!Opcode) |
1518 | return false; |
1519 | SDValue Ops[] = {Base, Offset, Chain}; |
1520 | LD = CurDAG->getMachineNode(Opcode: *Opcode, dl: DL, VTs: InstVTList, Ops); |
1521 | } else { |
1522 | if (TM.is64Bit()) { |
1523 | switch (N->getOpcode()) { |
1524 | default: |
1525 | return false; |
1526 | case ISD::LOAD: |
1527 | case ISD::INTRINSIC_W_CHAIN: |
1528 | if (IsLDG) |
1529 | Opcode = pickOpcodeForVT(VT: EltVT.getSimpleVT().SimpleTy, |
1530 | Opcode_i8: NVPTX::INT_PTX_LDG_GLOBAL_i8areg64, |
1531 | Opcode_i16: NVPTX::INT_PTX_LDG_GLOBAL_i16areg64, |
1532 | Opcode_i32: NVPTX::INT_PTX_LDG_GLOBAL_i32areg64, |
1533 | Opcode_i64: NVPTX::INT_PTX_LDG_GLOBAL_i64areg64, |
1534 | Opcode_f32: NVPTX::INT_PTX_LDG_GLOBAL_f32areg64, |
1535 | Opcode_f64: NVPTX::INT_PTX_LDG_GLOBAL_f64areg64); |
1536 | else |
1537 | Opcode = pickOpcodeForVT(VT: EltVT.getSimpleVT().SimpleTy, |
1538 | Opcode_i8: NVPTX::INT_PTX_LDU_GLOBAL_i8areg64, |
1539 | Opcode_i16: NVPTX::INT_PTX_LDU_GLOBAL_i16areg64, |
1540 | Opcode_i32: NVPTX::INT_PTX_LDU_GLOBAL_i32areg64, |
1541 | Opcode_i64: NVPTX::INT_PTX_LDU_GLOBAL_i64areg64, |
1542 | Opcode_f32: NVPTX::INT_PTX_LDU_GLOBAL_f32areg64, |
1543 | Opcode_f64: NVPTX::INT_PTX_LDU_GLOBAL_f64areg64); |
1544 | break; |
1545 | case NVPTXISD::LoadV2: |
1546 | case NVPTXISD::LDGV2: |
1547 | Opcode = pickOpcodeForVT(VT: EltVT.getSimpleVT().SimpleTy, |
1548 | Opcode_i8: NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64, |
1549 | Opcode_i16: NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64, |
1550 | Opcode_i32: NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64, |
1551 | Opcode_i64: NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64, |
1552 | Opcode_f32: NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64, |
1553 | Opcode_f64: NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64); |
1554 | break; |
1555 | case NVPTXISD::LDUV2: |
1556 | Opcode = pickOpcodeForVT(VT: EltVT.getSimpleVT().SimpleTy, |
1557 | Opcode_i8: NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64, |
1558 | Opcode_i16: NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64, |
1559 | Opcode_i32: NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64, |
1560 | Opcode_i64: NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64, |
1561 | Opcode_f32: NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64, |
1562 | Opcode_f64: NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64); |
1563 | break; |
1564 | case NVPTXISD::LoadV4: |
1565 | case NVPTXISD::LDGV4: |
1566 | Opcode = pickOpcodeForVT( |
1567 | VT: EltVT.getSimpleVT().SimpleTy, Opcode_i8: NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64, |
1568 | Opcode_i16: NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64, |
1569 | Opcode_i32: NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64, Opcode_i64: std::nullopt, |
1570 | Opcode_f32: NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64, Opcode_f64: std::nullopt); |
1571 | break; |
1572 | case NVPTXISD::LDUV4: |
1573 | Opcode = pickOpcodeForVT( |
1574 | VT: EltVT.getSimpleVT().SimpleTy, Opcode_i8: NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64, |
1575 | Opcode_i16: NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64, |
1576 | Opcode_i32: NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64, Opcode_i64: std::nullopt, |
1577 | Opcode_f32: NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64, Opcode_f64: std::nullopt); |
1578 | break; |
1579 | } |
1580 | } else { |
1581 | switch (N->getOpcode()) { |
1582 | default: |
1583 | return false; |
1584 | case ISD::LOAD: |
1585 | case ISD::INTRINSIC_W_CHAIN: |
1586 | if (IsLDG) |
1587 | Opcode = pickOpcodeForVT(VT: EltVT.getSimpleVT().SimpleTy, |
1588 | Opcode_i8: NVPTX::INT_PTX_LDG_GLOBAL_i8areg, |
1589 | Opcode_i16: NVPTX::INT_PTX_LDG_GLOBAL_i16areg, |
1590 | Opcode_i32: NVPTX::INT_PTX_LDG_GLOBAL_i32areg, |
1591 | Opcode_i64: NVPTX::INT_PTX_LDG_GLOBAL_i64areg, |
1592 | Opcode_f32: NVPTX::INT_PTX_LDG_GLOBAL_f32areg, |
1593 | Opcode_f64: NVPTX::INT_PTX_LDG_GLOBAL_f64areg); |
1594 | else |
1595 | Opcode = pickOpcodeForVT(VT: EltVT.getSimpleVT().SimpleTy, |
1596 | Opcode_i8: NVPTX::INT_PTX_LDU_GLOBAL_i8areg, |
1597 | Opcode_i16: NVPTX::INT_PTX_LDU_GLOBAL_i16areg, |
1598 | Opcode_i32: NVPTX::INT_PTX_LDU_GLOBAL_i32areg, |
1599 | Opcode_i64: NVPTX::INT_PTX_LDU_GLOBAL_i64areg, |
1600 | Opcode_f32: NVPTX::INT_PTX_LDU_GLOBAL_f32areg, |
1601 | Opcode_f64: NVPTX::INT_PTX_LDU_GLOBAL_f64areg); |
1602 | break; |
1603 | case NVPTXISD::LoadV2: |
1604 | case NVPTXISD::LDGV2: |
1605 | Opcode = pickOpcodeForVT(VT: EltVT.getSimpleVT().SimpleTy, |
1606 | Opcode_i8: NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32, |
1607 | Opcode_i16: NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32, |
1608 | Opcode_i32: NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32, |
1609 | Opcode_i64: NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32, |
1610 | Opcode_f32: NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32, |
1611 | Opcode_f64: NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32); |
1612 | break; |
1613 | case NVPTXISD::LDUV2: |
1614 | Opcode = pickOpcodeForVT(VT: EltVT.getSimpleVT().SimpleTy, |
1615 | Opcode_i8: NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32, |
1616 | Opcode_i16: NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32, |
1617 | Opcode_i32: NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32, |
1618 | Opcode_i64: NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32, |
1619 | Opcode_f32: NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32, |
1620 | Opcode_f64: NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32); |
1621 | break; |
1622 | case NVPTXISD::LoadV4: |
1623 | case NVPTXISD::LDGV4: |
1624 | Opcode = pickOpcodeForVT( |
1625 | VT: EltVT.getSimpleVT().SimpleTy, Opcode_i8: NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32, |
1626 | Opcode_i16: NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32, |
1627 | Opcode_i32: NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32, Opcode_i64: std::nullopt, |
1628 | Opcode_f32: NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32, Opcode_f64: std::nullopt); |
1629 | break; |
1630 | case NVPTXISD::LDUV4: |
1631 | Opcode = pickOpcodeForVT( |
1632 | VT: EltVT.getSimpleVT().SimpleTy, Opcode_i8: NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32, |
1633 | Opcode_i16: NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32, |
1634 | Opcode_i32: NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32, Opcode_i64: std::nullopt, |
1635 | Opcode_f32: NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32, Opcode_f64: std::nullopt); |
1636 | break; |
1637 | } |
1638 | } |
1639 | if (!Opcode) |
1640 | return false; |
SDValue Ops[] = {Op1, Chain};
1642 | LD = CurDAG->getMachineNode(Opcode: *Opcode, dl: DL, VTs: InstVTList, Ops); |
1643 | } |
1644 | |
1645 | // For automatic generation of LDG (through SelectLoad[Vector], not the |
1646 | // intrinsics), we may have an extending load like: |
1647 | // |
1648 | // i32,ch = load<LD1[%data1(addrspace=1)], zext from i8> t0, t7, undef:i64 |
1649 | // |
1650 | // In this case, the matching logic above will select a load for the original |
1651 | // memory type (in this case, i8) and our types will not match (the node needs |
1652 | // to return an i32 in this case). Our LDG/LDU nodes do not support the |
1653 | // concept of sign-/zero-extension, so emulate it here by adding an explicit |
1654 | // CVT instruction. Ptxas should clean up any redundancies here. |
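//
// For example, a zero-extending i8-to-i32 load is selected as an i8 LDG
// followed by a conversion (roughly cvt.u32.u8 in the emitted PTX);
// GetConvertOpcode picks the signed/unsigned variant from the load's
// extension kind.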
1655 | |
1656 | LoadSDNode *LdNode = dyn_cast<LoadSDNode>(Val: N); |
1657 | |
1658 | if (OrigType != EltVT && |
1659 | (LdNode || (OrigType.isFloatingPoint() && EltVT.isFloatingPoint()))) { |
1660 | // We have an extending-load. The instruction we selected operates on the |
1661 | // smaller type, but the SDNode we are replacing has the larger type. We |
1662 | // need to emit a CVT to make the types match. |
1663 | unsigned CvtOpc = |
1664 | GetConvertOpcode(DestTy: OrigType.getSimpleVT(), SrcTy: EltVT.getSimpleVT(), N: LdNode); |
1665 | |
1666 | // For each output value, apply the manual sign/zero-extension and make sure |
1667 | // all users of the load go through that CVT. |
1668 | for (unsigned i = 0; i != NumElts; ++i) { |
1669 | SDValue Res(LD, i); |
1670 | SDValue OrigVal(N, i); |
1671 | |
1672 | SDNode *CvtNode = |
1673 | CurDAG->getMachineNode(Opcode: CvtOpc, dl: DL, VT: OrigType, Op1: Res, |
1674 | Op2: CurDAG->getTargetConstant(Val: NVPTX::PTXCvtMode::NONE, |
1675 | DL, VT: MVT::i32)); |
1676 | ReplaceUses(F: OrigVal, T: SDValue(CvtNode, 0)); |
1677 | } |
1678 | } |
1679 | |
1680 | ReplaceNode(F: N, T: LD); |
1681 | return true; |
1682 | } |
1683 | |
1684 | bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { |
1685 | SDLoc dl(N); |
1686 | MemSDNode *ST = cast<MemSDNode>(Val: N); |
1687 | assert(ST->writeMem() && "Expected store" ); |
1688 | StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(Val: N); |
1689 | AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(Val: N); |
1690 | assert((PlainStore || AtomicStore) && "Expected store" ); |
1691 | EVT StoreVT = ST->getMemoryVT(); |
1692 | SDNode *NVPTXST = nullptr; |
1693 | |
// Indexed (pre/post increment/decrement) stores are not supported.
1695 | if (PlainStore && PlainStore->isIndexed()) |
1696 | return false; |
1697 | |
1698 | if (!StoreVT.isSimple()) |
1699 | return false; |
1700 | |
1701 | AtomicOrdering Ordering = ST->getSuccessOrdering(); |
// In order to lower atomic stores with stronger guarantees we would need to
// use store.release or insert fences. However, these features were only added
// with PTX ISA 6.0 / sm_70.
1705 | // TODO: Check if we can actually use the new instructions and implement them. |
1706 | if (isStrongerThanMonotonic(AO: Ordering)) |
1707 | return false; |
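// For example, a release or seq_cst atomic store is rejected here; monotonic
// stores are accepted and lowered with .volatile semantics below.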
1708 | |
1709 | // Address Space Setting |
1710 | unsigned int CodeAddrSpace = getCodeAddrSpace(N: ST); |
1711 | unsigned int PointerSize = |
1712 | CurDAG->getDataLayout().getPointerSizeInBits(AS: ST->getAddressSpace()); |
1713 | |
1714 | // Volatile Setting |
1715 | // - .volatile is only available for .global and .shared |
1716 | // - .volatile has the same memory synchronization semantics as .relaxed.sys |
1717 | bool isVolatile = ST->isVolatile() || Ordering == AtomicOrdering::Monotonic; |
1718 | if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL && |
1719 | CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED && |
1720 | CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC) |
1721 | isVolatile = false; |
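// For example, a volatile store to the .param or .local space is emitted as a
// plain store, since PTX defines .volatile only for generic, .global, and
// .shared addresses.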
1722 | |
1723 | // Vector Setting |
1724 | MVT SimpleVT = StoreVT.getSimpleVT(); |
1725 | unsigned vecType = NVPTX::PTXLdStInstCode::Scalar; |
1726 | |
// Type Setting: toType + toTypeWidth
// - for integer types, always use 'u'
1730 | MVT ScalarVT = SimpleVT.getScalarType(); |
1731 | unsigned toTypeWidth = ScalarVT.getSizeInBits(); |
1732 | if (SimpleVT.isVector()) { |
1733 | assert((Isv2x16VT(StoreVT) || StoreVT == MVT::v4i8) && |
1734 | "Unexpected vector type" ); |
1735 | // v2x16 is stored using st.b32 |
1736 | toTypeWidth = 32; |
1737 | } |
1738 | |
1739 | unsigned int toType = getLdStRegType(VT: ScalarVT); |
1740 | |
1741 | // Create the machine instruction DAG |
1742 | SDValue Chain = ST->getChain(); |
1743 | SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal(); |
1744 | SDValue BasePtr = ST->getBasePtr(); |
1745 | SDValue Addr; |
1746 | SDValue Offset, Base; |
1747 | std::optional<unsigned> Opcode; |
1748 | MVT::SimpleValueType SourceVT = |
1749 | Value.getNode()->getSimpleValueType(ResNo: 0).SimpleTy; |
1750 | |
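// The opcode is chosen by addressing mode, from most to least specific:
//   _avar: direct variable address,         e.g. st.f32 [x], %f1;
//   _asi:  symbol plus immediate offset,    e.g. st.f32 [x+8], %f1;
//   _ari:  register plus immediate offset,  e.g. st.f32 [%r1+8], %f1;
//   _areg: plain register,                  e.g. st.f32 [%r1], %f1;
// (illustrative PTX; the printed form also carries the volatile/address-space
// and type modifiers selected above).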
1751 | if (SelectDirectAddr(N: BasePtr, Address&: Addr)) { |
1752 | Opcode = pickOpcodeForVT(VT: SourceVT, Opcode_i8: NVPTX::ST_i8_avar, Opcode_i16: NVPTX::ST_i16_avar, |
1753 | Opcode_i32: NVPTX::ST_i32_avar, Opcode_i64: NVPTX::ST_i64_avar, |
1754 | Opcode_f32: NVPTX::ST_f32_avar, Opcode_f64: NVPTX::ST_f64_avar); |
1755 | if (!Opcode) |
1756 | return false; |
1757 | SDValue Ops[] = {Value, |
1758 | getI32Imm(Imm: isVolatile, DL: dl), |
1759 | getI32Imm(Imm: CodeAddrSpace, DL: dl), |
1760 | getI32Imm(Imm: vecType, DL: dl), |
1761 | getI32Imm(Imm: toType, DL: dl), |
1762 | getI32Imm(Imm: toTypeWidth, DL: dl), |
1763 | Addr, |
1764 | Chain}; |
1765 | NVPTXST = CurDAG->getMachineNode(Opcode: *Opcode, dl, VT: MVT::Other, Ops); |
1766 | } else if (PointerSize == 64 |
1767 | ? SelectADDRsi64(OpNode: BasePtr.getNode(), Addr: BasePtr, Base, Offset) |
1768 | : SelectADDRsi(OpNode: BasePtr.getNode(), Addr: BasePtr, Base, Offset)) { |
1769 | Opcode = pickOpcodeForVT(VT: SourceVT, Opcode_i8: NVPTX::ST_i8_asi, Opcode_i16: NVPTX::ST_i16_asi, |
1770 | Opcode_i32: NVPTX::ST_i32_asi, Opcode_i64: NVPTX::ST_i64_asi, |
1771 | Opcode_f32: NVPTX::ST_f32_asi, Opcode_f64: NVPTX::ST_f64_asi); |
1772 | if (!Opcode) |
1773 | return false; |
1774 | SDValue Ops[] = {Value, |
1775 | getI32Imm(Imm: isVolatile, DL: dl), |
1776 | getI32Imm(Imm: CodeAddrSpace, DL: dl), |
1777 | getI32Imm(Imm: vecType, DL: dl), |
1778 | getI32Imm(Imm: toType, DL: dl), |
1779 | getI32Imm(Imm: toTypeWidth, DL: dl), |
1780 | Base, |
1781 | Offset, |
1782 | Chain}; |
1783 | NVPTXST = CurDAG->getMachineNode(Opcode: *Opcode, dl, VT: MVT::Other, Ops); |
1784 | } else if (PointerSize == 64 |
1785 | ? SelectADDRri64(OpNode: BasePtr.getNode(), Addr: BasePtr, Base, Offset) |
1786 | : SelectADDRri(OpNode: BasePtr.getNode(), Addr: BasePtr, Base, Offset)) { |
1787 | if (PointerSize == 64) |
1788 | Opcode = |
1789 | pickOpcodeForVT(VT: SourceVT, Opcode_i8: NVPTX::ST_i8_ari_64, Opcode_i16: NVPTX::ST_i16_ari_64, |
1790 | Opcode_i32: NVPTX::ST_i32_ari_64, Opcode_i64: NVPTX::ST_i64_ari_64, |
1791 | Opcode_f32: NVPTX::ST_f32_ari_64, Opcode_f64: NVPTX::ST_f64_ari_64); |
1792 | else |
1793 | Opcode = pickOpcodeForVT(VT: SourceVT, Opcode_i8: NVPTX::ST_i8_ari, Opcode_i16: NVPTX::ST_i16_ari, |
1794 | Opcode_i32: NVPTX::ST_i32_ari, Opcode_i64: NVPTX::ST_i64_ari, |
1795 | Opcode_f32: NVPTX::ST_f32_ari, Opcode_f64: NVPTX::ST_f64_ari); |
1796 | if (!Opcode) |
1797 | return false; |
1798 | |
1799 | SDValue Ops[] = {Value, |
1800 | getI32Imm(Imm: isVolatile, DL: dl), |
1801 | getI32Imm(Imm: CodeAddrSpace, DL: dl), |
1802 | getI32Imm(Imm: vecType, DL: dl), |
1803 | getI32Imm(Imm: toType, DL: dl), |
1804 | getI32Imm(Imm: toTypeWidth, DL: dl), |
1805 | Base, |
1806 | Offset, |
1807 | Chain}; |
1808 | NVPTXST = CurDAG->getMachineNode(Opcode: *Opcode, dl, VT: MVT::Other, Ops); |
1809 | } else { |
1810 | if (PointerSize == 64) |
1811 | Opcode = |
1812 | pickOpcodeForVT(VT: SourceVT, Opcode_i8: NVPTX::ST_i8_areg_64, Opcode_i16: NVPTX::ST_i16_areg_64, |
1813 | Opcode_i32: NVPTX::ST_i32_areg_64, Opcode_i64: NVPTX::ST_i64_areg_64, |
1814 | Opcode_f32: NVPTX::ST_f32_areg_64, Opcode_f64: NVPTX::ST_f64_areg_64); |
1815 | else |
1816 | Opcode = pickOpcodeForVT(VT: SourceVT, Opcode_i8: NVPTX::ST_i8_areg, Opcode_i16: NVPTX::ST_i16_areg, |
1817 | Opcode_i32: NVPTX::ST_i32_areg, Opcode_i64: NVPTX::ST_i64_areg, |
1818 | Opcode_f32: NVPTX::ST_f32_areg, Opcode_f64: NVPTX::ST_f64_areg); |
1819 | if (!Opcode) |
1820 | return false; |
1821 | SDValue Ops[] = {Value, |
1822 | getI32Imm(Imm: isVolatile, DL: dl), |
1823 | getI32Imm(Imm: CodeAddrSpace, DL: dl), |
1824 | getI32Imm(Imm: vecType, DL: dl), |
1825 | getI32Imm(Imm: toType, DL: dl), |
1826 | getI32Imm(Imm: toTypeWidth, DL: dl), |
1827 | BasePtr, |
1828 | Chain}; |
1829 | NVPTXST = CurDAG->getMachineNode(Opcode: *Opcode, dl, VT: MVT::Other, Ops); |
1830 | } |
1831 | |
1832 | if (!NVPTXST) |
1833 | return false; |
1834 | |
1835 | MachineMemOperand *MemRef = cast<MemSDNode>(Val: N)->getMemOperand(); |
1836 | CurDAG->setNodeMemRefs(N: cast<MachineSDNode>(Val: NVPTXST), NewMemRefs: {MemRef}); |
1837 | ReplaceNode(F: N, T: NVPTXST); |
1838 | return true; |
1839 | } |
1840 | |
1841 | bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { |
1842 | SDValue Chain = N->getOperand(Num: 0); |
1843 | SDValue Op1 = N->getOperand(Num: 1); |
1844 | SDValue Addr, Offset, Base; |
1845 | std::optional<unsigned> Opcode; |
1846 | SDLoc DL(N); |
1847 | SDNode *ST; |
1848 | EVT EltVT = Op1.getValueType(); |
1849 | MemSDNode *MemSD = cast<MemSDNode>(Val: N); |
1850 | EVT StoreVT = MemSD->getMemoryVT(); |
1851 | |
1852 | // Address Space Setting |
1853 | unsigned CodeAddrSpace = getCodeAddrSpace(N: MemSD); |
1854 | if (CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT) { |
1855 | report_fatal_error(reason: "Cannot store to pointer that points to constant " |
1856 | "memory space" ); |
1857 | } |
1858 | unsigned int PointerSize = |
1859 | CurDAG->getDataLayout().getPointerSizeInBits(AS: MemSD->getAddressSpace()); |
1860 | |
1861 | // Volatile Setting |
// - .volatile is only available for .global and .shared
1863 | bool IsVolatile = MemSD->isVolatile(); |
1864 | if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL && |
1865 | CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED && |
1866 | CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC) |
1867 | IsVolatile = false; |
1868 | |
1869 | // Type Setting: toType + toTypeWidth |
1870 | // - for integer type, always use 'u' |
1871 | assert(StoreVT.isSimple() && "Store value is not simple" ); |
1872 | MVT ScalarVT = StoreVT.getSimpleVT().getScalarType(); |
1873 | unsigned ToTypeWidth = ScalarVT.getSizeInBits(); |
1874 | unsigned ToType = getLdStRegType(VT: ScalarVT); |
1875 | |
1876 | SmallVector<SDValue, 12> StOps; |
1877 | SDValue N2; |
1878 | unsigned VecType; |
1879 | |
1880 | switch (N->getOpcode()) { |
1881 | case NVPTXISD::StoreV2: |
1882 | VecType = NVPTX::PTXLdStInstCode::V2; |
1883 | StOps.push_back(Elt: N->getOperand(Num: 1)); |
1884 | StOps.push_back(Elt: N->getOperand(Num: 2)); |
1885 | N2 = N->getOperand(Num: 3); |
1886 | break; |
1887 | case NVPTXISD::StoreV4: |
1888 | VecType = NVPTX::PTXLdStInstCode::V4; |
1889 | StOps.push_back(Elt: N->getOperand(Num: 1)); |
1890 | StOps.push_back(Elt: N->getOperand(Num: 2)); |
1891 | StOps.push_back(Elt: N->getOperand(Num: 3)); |
1892 | StOps.push_back(Elt: N->getOperand(Num: 4)); |
1893 | N2 = N->getOperand(Num: 5); |
1894 | break; |
1895 | default: |
1896 | return false; |
1897 | } |
1898 | |
// v8x16 is a special case. PTX doesn't have an st.v8.x16 instruction.
// Instead, we split the vector into v2x16 chunks and store them with
// st.v4.b32.
1902 | if (Isv2x16VT(VT: EltVT)) { |
assert(N->getOpcode() == NVPTXISD::StoreV4 && "Unexpected store opcode.");
1904 | EltVT = MVT::i32; |
1905 | ToType = NVPTX::PTXLdStInstCode::Untyped; |
1906 | ToTypeWidth = 32; |
1907 | } |
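// For example, a v8f16 store arrives here as a StoreV4 whose four value
// operands are v2f16; after this adjustment it is emitted as one st.v4.b32 of
// the packed 32-bit registers.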
1908 | |
1909 | StOps.push_back(Elt: getI32Imm(Imm: IsVolatile, DL)); |
1910 | StOps.push_back(Elt: getI32Imm(Imm: CodeAddrSpace, DL)); |
1911 | StOps.push_back(Elt: getI32Imm(Imm: VecType, DL)); |
1912 | StOps.push_back(Elt: getI32Imm(Imm: ToType, DL)); |
1913 | StOps.push_back(Elt: getI32Imm(Imm: ToTypeWidth, DL)); |
1914 | |
1915 | if (SelectDirectAddr(N: N2, Address&: Addr)) { |
1916 | switch (N->getOpcode()) { |
1917 | default: |
1918 | return false; |
1919 | case NVPTXISD::StoreV2: |
1920 | Opcode = pickOpcodeForVT(VT: EltVT.getSimpleVT().SimpleTy, |
1921 | Opcode_i8: NVPTX::STV_i8_v2_avar, Opcode_i16: NVPTX::STV_i16_v2_avar, |
1922 | Opcode_i32: NVPTX::STV_i32_v2_avar, Opcode_i64: NVPTX::STV_i64_v2_avar, |
1923 | Opcode_f32: NVPTX::STV_f32_v2_avar, Opcode_f64: NVPTX::STV_f64_v2_avar); |
1924 | break; |
1925 | case NVPTXISD::StoreV4: |
1926 | Opcode = pickOpcodeForVT(VT: EltVT.getSimpleVT().SimpleTy, |
1927 | Opcode_i8: NVPTX::STV_i8_v4_avar, Opcode_i16: NVPTX::STV_i16_v4_avar, |
1928 | Opcode_i32: NVPTX::STV_i32_v4_avar, Opcode_i64: std::nullopt, |
1929 | Opcode_f32: NVPTX::STV_f32_v4_avar, Opcode_f64: std::nullopt); |
1930 | break; |
1931 | } |
1932 | StOps.push_back(Elt: Addr); |
1933 | } else if (PointerSize == 64 ? SelectADDRsi64(OpNode: N2.getNode(), Addr: N2, Base, Offset) |
1934 | : SelectADDRsi(OpNode: N2.getNode(), Addr: N2, Base, Offset)) { |
1935 | switch (N->getOpcode()) { |
1936 | default: |
1937 | return false; |
1938 | case NVPTXISD::StoreV2: |
1939 | Opcode = pickOpcodeForVT(VT: EltVT.getSimpleVT().SimpleTy, |
1940 | Opcode_i8: NVPTX::STV_i8_v2_asi, Opcode_i16: NVPTX::STV_i16_v2_asi, |
1941 | Opcode_i32: NVPTX::STV_i32_v2_asi, Opcode_i64: NVPTX::STV_i64_v2_asi, |
1942 | Opcode_f32: NVPTX::STV_f32_v2_asi, Opcode_f64: NVPTX::STV_f64_v2_asi); |
1943 | break; |
1944 | case NVPTXISD::StoreV4: |
1945 | Opcode = |
1946 | pickOpcodeForVT(VT: EltVT.getSimpleVT().SimpleTy, Opcode_i8: NVPTX::STV_i8_v4_asi, |
1947 | Opcode_i16: NVPTX::STV_i16_v4_asi, Opcode_i32: NVPTX::STV_i32_v4_asi, |
1948 | Opcode_i64: std::nullopt, Opcode_f32: NVPTX::STV_f32_v4_asi, Opcode_f64: std::nullopt); |
1949 | break; |
1950 | } |
1951 | StOps.push_back(Elt: Base); |
1952 | StOps.push_back(Elt: Offset); |
1953 | } else if (PointerSize == 64 ? SelectADDRri64(OpNode: N2.getNode(), Addr: N2, Base, Offset) |
1954 | : SelectADDRri(OpNode: N2.getNode(), Addr: N2, Base, Offset)) { |
1955 | if (PointerSize == 64) { |
1956 | switch (N->getOpcode()) { |
1957 | default: |
1958 | return false; |
1959 | case NVPTXISD::StoreV2: |
1960 | Opcode = |
1961 | pickOpcodeForVT(VT: EltVT.getSimpleVT().SimpleTy, |
1962 | Opcode_i8: NVPTX::STV_i8_v2_ari_64, Opcode_i16: NVPTX::STV_i16_v2_ari_64, |
1963 | Opcode_i32: NVPTX::STV_i32_v2_ari_64, Opcode_i64: NVPTX::STV_i64_v2_ari_64, |
1964 | Opcode_f32: NVPTX::STV_f32_v2_ari_64, Opcode_f64: NVPTX::STV_f64_v2_ari_64); |
1965 | break; |
1966 | case NVPTXISD::StoreV4: |
1967 | Opcode = pickOpcodeForVT( |
1968 | VT: EltVT.getSimpleVT().SimpleTy, Opcode_i8: NVPTX::STV_i8_v4_ari_64, |
1969 | Opcode_i16: NVPTX::STV_i16_v4_ari_64, Opcode_i32: NVPTX::STV_i32_v4_ari_64, Opcode_i64: std::nullopt, |
1970 | Opcode_f32: NVPTX::STV_f32_v4_ari_64, Opcode_f64: std::nullopt); |
1971 | break; |
1972 | } |
1973 | } else { |
1974 | switch (N->getOpcode()) { |
1975 | default: |
1976 | return false; |
1977 | case NVPTXISD::StoreV2: |
1978 | Opcode = pickOpcodeForVT(VT: EltVT.getSimpleVT().SimpleTy, |
1979 | Opcode_i8: NVPTX::STV_i8_v2_ari, Opcode_i16: NVPTX::STV_i16_v2_ari, |
1980 | Opcode_i32: NVPTX::STV_i32_v2_ari, Opcode_i64: NVPTX::STV_i64_v2_ari, |
1981 | Opcode_f32: NVPTX::STV_f32_v2_ari, Opcode_f64: NVPTX::STV_f64_v2_ari); |
1982 | break; |
1983 | case NVPTXISD::StoreV4: |
1984 | Opcode = pickOpcodeForVT(VT: EltVT.getSimpleVT().SimpleTy, |
1985 | Opcode_i8: NVPTX::STV_i8_v4_ari, Opcode_i16: NVPTX::STV_i16_v4_ari, |
1986 | Opcode_i32: NVPTX::STV_i32_v4_ari, Opcode_i64: std::nullopt, |
1987 | Opcode_f32: NVPTX::STV_f32_v4_ari, Opcode_f64: std::nullopt); |
1988 | break; |
1989 | } |
1990 | } |
1991 | StOps.push_back(Elt: Base); |
1992 | StOps.push_back(Elt: Offset); |
1993 | } else { |
1994 | if (PointerSize == 64) { |
1995 | switch (N->getOpcode()) { |
1996 | default: |
1997 | return false; |
1998 | case NVPTXISD::StoreV2: |
1999 | Opcode = pickOpcodeForVT( |
2000 | VT: EltVT.getSimpleVT().SimpleTy, Opcode_i8: NVPTX::STV_i8_v2_areg_64, |
2001 | Opcode_i16: NVPTX::STV_i16_v2_areg_64, Opcode_i32: NVPTX::STV_i32_v2_areg_64, |
2002 | Opcode_i64: NVPTX::STV_i64_v2_areg_64, Opcode_f32: NVPTX::STV_f32_v2_areg_64, |
2003 | Opcode_f64: NVPTX::STV_f64_v2_areg_64); |
2004 | break; |
2005 | case NVPTXISD::StoreV4: |
2006 | Opcode = pickOpcodeForVT( |
2007 | VT: EltVT.getSimpleVT().SimpleTy, Opcode_i8: NVPTX::STV_i8_v4_areg_64, |
2008 | Opcode_i16: NVPTX::STV_i16_v4_areg_64, Opcode_i32: NVPTX::STV_i32_v4_areg_64, Opcode_i64: std::nullopt, |
2009 | Opcode_f32: NVPTX::STV_f32_v4_areg_64, Opcode_f64: std::nullopt); |
2010 | break; |
2011 | } |
2012 | } else { |
2013 | switch (N->getOpcode()) { |
2014 | default: |
2015 | return false; |
2016 | case NVPTXISD::StoreV2: |
2017 | Opcode = |
2018 | pickOpcodeForVT(VT: EltVT.getSimpleVT().SimpleTy, Opcode_i8: NVPTX::STV_i8_v2_areg, |
2019 | Opcode_i16: NVPTX::STV_i16_v2_areg, Opcode_i32: NVPTX::STV_i32_v2_areg, |
2020 | Opcode_i64: NVPTX::STV_i64_v2_areg, Opcode_f32: NVPTX::STV_f32_v2_areg, |
2021 | Opcode_f64: NVPTX::STV_f64_v2_areg); |
2022 | break; |
2023 | case NVPTXISD::StoreV4: |
2024 | Opcode = |
2025 | pickOpcodeForVT(VT: EltVT.getSimpleVT().SimpleTy, Opcode_i8: NVPTX::STV_i8_v4_areg, |
2026 | Opcode_i16: NVPTX::STV_i16_v4_areg, Opcode_i32: NVPTX::STV_i32_v4_areg, |
2027 | Opcode_i64: std::nullopt, Opcode_f32: NVPTX::STV_f32_v4_areg, Opcode_f64: std::nullopt); |
2028 | break; |
2029 | } |
2030 | } |
2031 | StOps.push_back(Elt: N2); |
2032 | } |
2033 | |
2034 | if (!Opcode) |
2035 | return false; |
2036 | |
2037 | StOps.push_back(Elt: Chain); |
2038 | |
2039 | ST = CurDAG->getMachineNode(Opcode: *Opcode, dl: DL, VT: MVT::Other, Ops: StOps); |
2040 | |
2041 | MachineMemOperand *MemRef = cast<MemSDNode>(Val: N)->getMemOperand(); |
2042 | CurDAG->setNodeMemRefs(N: cast<MachineSDNode>(Val: ST), NewMemRefs: {MemRef}); |
2043 | |
2044 | ReplaceNode(F: N, T: ST); |
2045 | return true; |
2046 | } |
2047 | |
2048 | bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) { |
2049 | SDValue Chain = Node->getOperand(Num: 0); |
2050 | SDValue Offset = Node->getOperand(Num: 2); |
2051 | SDValue Glue = Node->getOperand(Num: 3); |
2052 | SDLoc DL(Node); |
2053 | MemSDNode *Mem = cast<MemSDNode>(Val: Node); |
2054 | |
2055 | unsigned VecSize; |
2056 | switch (Node->getOpcode()) { |
2057 | default: |
2058 | return false; |
2059 | case NVPTXISD::LoadParam: |
2060 | VecSize = 1; |
2061 | break; |
2062 | case NVPTXISD::LoadParamV2: |
2063 | VecSize = 2; |
2064 | break; |
2065 | case NVPTXISD::LoadParamV4: |
2066 | VecSize = 4; |
2067 | break; |
2068 | } |
2069 | |
2070 | EVT EltVT = Node->getValueType(ResNo: 0); |
2071 | EVT MemVT = Mem->getMemoryVT(); |
2072 | |
2073 | std::optional<unsigned> Opcode; |
2074 | |
2075 | switch (VecSize) { |
2076 | default: |
2077 | return false; |
2078 | case 1: |
2079 | Opcode = pickOpcodeForVT(VT: MemVT.getSimpleVT().SimpleTy, |
2080 | Opcode_i8: NVPTX::LoadParamMemI8, Opcode_i16: NVPTX::LoadParamMemI16, |
2081 | Opcode_i32: NVPTX::LoadParamMemI32, Opcode_i64: NVPTX::LoadParamMemI64, |
2082 | Opcode_f32: NVPTX::LoadParamMemF32, Opcode_f64: NVPTX::LoadParamMemF64); |
2083 | break; |
2084 | case 2: |
2085 | Opcode = |
2086 | pickOpcodeForVT(VT: MemVT.getSimpleVT().SimpleTy, Opcode_i8: NVPTX::LoadParamMemV2I8, |
2087 | Opcode_i16: NVPTX::LoadParamMemV2I16, Opcode_i32: NVPTX::LoadParamMemV2I32, |
2088 | Opcode_i64: NVPTX::LoadParamMemV2I64, Opcode_f32: NVPTX::LoadParamMemV2F32, |
2089 | Opcode_f64: NVPTX::LoadParamMemV2F64); |
2090 | break; |
2091 | case 4: |
2092 | Opcode = |
2093 | pickOpcodeForVT(VT: MemVT.getSimpleVT().SimpleTy, Opcode_i8: NVPTX::LoadParamMemV4I8, |
2094 | Opcode_i16: NVPTX::LoadParamMemV4I16, Opcode_i32: NVPTX::LoadParamMemV4I32, |
2095 | Opcode_i64: std::nullopt, Opcode_f32: NVPTX::LoadParamMemV4F32, Opcode_f64: std::nullopt); |
2096 | break; |
2097 | } |
2098 | if (!Opcode) |
2099 | return false; |
2100 | |
2101 | SDVTList VTs; |
2102 | if (VecSize == 1) { |
2103 | VTs = CurDAG->getVTList(VT1: EltVT, VT2: MVT::Other, VT3: MVT::Glue); |
2104 | } else if (VecSize == 2) { |
2105 | VTs = CurDAG->getVTList(VT1: EltVT, VT2: EltVT, VT3: MVT::Other, VT4: MVT::Glue); |
2106 | } else { |
EVT EVTs[] = {EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue};
2108 | VTs = CurDAG->getVTList(VTs: EVTs); |
2109 | } |
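// The trailing MVT::Glue result keeps the selected LoadParamMem* node ordered
// within the glued call sequence it belongs to (note the Glue operand above).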
2110 | |
2111 | unsigned OffsetVal = Offset->getAsZExtVal(); |
2112 | |
2113 | SmallVector<SDValue, 2> Ops; |
2114 | Ops.push_back(Elt: CurDAG->getTargetConstant(Val: OffsetVal, DL, VT: MVT::i32)); |
2115 | Ops.push_back(Elt: Chain); |
2116 | Ops.push_back(Elt: Glue); |
2117 | |
2118 | ReplaceNode(F: Node, T: CurDAG->getMachineNode(Opcode: *Opcode, dl: DL, VTs, Ops)); |
2119 | return true; |
2120 | } |
2121 | |
2122 | bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) { |
2123 | SDLoc DL(N); |
2124 | SDValue Chain = N->getOperand(Num: 0); |
2125 | SDValue Offset = N->getOperand(Num: 1); |
2126 | unsigned OffsetVal = Offset->getAsZExtVal(); |
2127 | MemSDNode *Mem = cast<MemSDNode>(Val: N); |
2128 | |
2129 | // How many elements do we have? |
2130 | unsigned NumElts = 1; |
2131 | switch (N->getOpcode()) { |
2132 | default: |
2133 | return false; |
2134 | case NVPTXISD::StoreRetval: |
2135 | NumElts = 1; |
2136 | break; |
2137 | case NVPTXISD::StoreRetvalV2: |
2138 | NumElts = 2; |
2139 | break; |
2140 | case NVPTXISD::StoreRetvalV4: |
2141 | NumElts = 4; |
2142 | break; |
2143 | } |
2144 | |
2145 | // Build vector of operands |
2146 | SmallVector<SDValue, 6> Ops; |
2147 | for (unsigned i = 0; i < NumElts; ++i) |
2148 | Ops.push_back(Elt: N->getOperand(Num: i + 2)); |
2149 | Ops.push_back(Elt: CurDAG->getTargetConstant(Val: OffsetVal, DL, VT: MVT::i32)); |
2150 | Ops.push_back(Elt: Chain); |
2151 | |
2152 | // Determine target opcode |
2153 | // If we have an i1, use an 8-bit store. The lowering code in |
2154 | // NVPTXISelLowering will have already emitted an upcast. |
std::optional<unsigned> Opcode;
2156 | switch (NumElts) { |
2157 | default: |
2158 | return false; |
2159 | case 1: |
2160 | Opcode = pickOpcodeForVT(VT: Mem->getMemoryVT().getSimpleVT().SimpleTy, |
2161 | Opcode_i8: NVPTX::StoreRetvalI8, Opcode_i16: NVPTX::StoreRetvalI16, |
2162 | Opcode_i32: NVPTX::StoreRetvalI32, Opcode_i64: NVPTX::StoreRetvalI64, |
2163 | Opcode_f32: NVPTX::StoreRetvalF32, Opcode_f64: NVPTX::StoreRetvalF64); |
2164 | if (Opcode == NVPTX::StoreRetvalI8) { |
2165 | // Fine tune the opcode depending on the size of the operand. |
2166 | // This helps to avoid creating redundant COPY instructions in |
2167 | // InstrEmitter::AddRegisterOperand(). |
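// For example, when the value operand is an i32, StoreRetvalI8TruncI32
// truncates within the store itself instead of requiring a COPY to a
// narrower register class first.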
2168 | switch (Ops[0].getSimpleValueType().SimpleTy) { |
2169 | default: |
2170 | break; |
2171 | case MVT::i32: |
2172 | Opcode = NVPTX::StoreRetvalI8TruncI32; |
2173 | break; |
2174 | case MVT::i64: |
2175 | Opcode = NVPTX::StoreRetvalI8TruncI64; |
2176 | break; |
2177 | } |
2178 | } |
2179 | break; |
2180 | case 2: |
2181 | Opcode = pickOpcodeForVT(VT: Mem->getMemoryVT().getSimpleVT().SimpleTy, |
2182 | Opcode_i8: NVPTX::StoreRetvalV2I8, Opcode_i16: NVPTX::StoreRetvalV2I16, |
2183 | Opcode_i32: NVPTX::StoreRetvalV2I32, Opcode_i64: NVPTX::StoreRetvalV2I64, |
2184 | Opcode_f32: NVPTX::StoreRetvalV2F32, Opcode_f64: NVPTX::StoreRetvalV2F64); |
2185 | break; |
2186 | case 4: |
2187 | Opcode = pickOpcodeForVT(VT: Mem->getMemoryVT().getSimpleVT().SimpleTy, |
2188 | Opcode_i8: NVPTX::StoreRetvalV4I8, Opcode_i16: NVPTX::StoreRetvalV4I16, |
2189 | Opcode_i32: NVPTX::StoreRetvalV4I32, Opcode_i64: std::nullopt, |
2190 | Opcode_f32: NVPTX::StoreRetvalV4F32, Opcode_f64: std::nullopt); |
2191 | break; |
2192 | } |
2193 | if (!Opcode) |
2194 | return false; |
2195 | |
2196 | SDNode *Ret = CurDAG->getMachineNode(Opcode: *Opcode, dl: DL, VT: MVT::Other, Ops); |
2197 | MachineMemOperand *MemRef = cast<MemSDNode>(Val: N)->getMemOperand(); |
2198 | CurDAG->setNodeMemRefs(N: cast<MachineSDNode>(Val: Ret), NewMemRefs: {MemRef}); |
2199 | |
2200 | ReplaceNode(F: N, T: Ret); |
2201 | return true; |
2202 | } |
2203 | |
// Helpers for constructing the StoreParam opcode from the operand kinds
// (e.g., NVPTX::StoreParamV4F32_iiri).
2205 | #define getOpcV2H(ty, opKind0, opKind1) \ |
2206 | NVPTX::StoreParamV2##ty##_##opKind0##opKind1 |
2207 | |
2208 | #define getOpcV2H1(ty, opKind0, isImm1) \ |
2209 | (isImm1) ? getOpcV2H(ty, opKind0, i) : getOpcV2H(ty, opKind0, r) |
2210 | |
2211 | #define getOpcodeForVectorStParamV2(ty, isimm) \ |
2212 | (isimm[0]) ? getOpcV2H1(ty, i, isimm[1]) : getOpcV2H1(ty, r, isimm[1]) |
2213 | |
2214 | #define getOpcV4H(ty, opKind0, opKind1, opKind2, opKind3) \ |
2215 | NVPTX::StoreParamV4##ty##_##opKind0##opKind1##opKind2##opKind3 |
2216 | |
2217 | #define getOpcV4H3(ty, opKind0, opKind1, opKind2, isImm3) \ |
2218 | (isImm3) ? getOpcV4H(ty, opKind0, opKind1, opKind2, i) \ |
2219 | : getOpcV4H(ty, opKind0, opKind1, opKind2, r) |
2220 | |
2221 | #define getOpcV4H2(ty, opKind0, opKind1, isImm2, isImm3) \ |
2222 | (isImm2) ? getOpcV4H3(ty, opKind0, opKind1, i, isImm3) \ |
2223 | : getOpcV4H3(ty, opKind0, opKind1, r, isImm3) |
2224 | |
2225 | #define getOpcV4H1(ty, opKind0, isImm1, isImm2, isImm3) \ |
2226 | (isImm1) ? getOpcV4H2(ty, opKind0, i, isImm2, isImm3) \ |
2227 | : getOpcV4H2(ty, opKind0, r, isImm2, isImm3) |
2228 | |
2229 | #define getOpcodeForVectorStParamV4(ty, isimm) \ |
2230 | (isimm[0]) ? getOpcV4H1(ty, i, isimm[1], isimm[2], isimm[3]) \ |
2231 | : getOpcV4H1(ty, r, isimm[1], isimm[2], isimm[3]) |
2232 | |
2233 | #define getOpcodeForVectorStParam(n, ty, isimm) \ |
2234 | (n == 2) ? getOpcodeForVectorStParamV2(ty, isimm) \ |
2235 | : getOpcodeForVectorStParamV4(ty, isimm) |
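// For example, with isimm == {true, true, false, true} the F32 case expands
// to NVPTX::StoreParamV4F32_iiri: each 'i' position takes an immediate
// operand and each 'r' position takes a register operand.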
2236 | |
2237 | static unsigned pickOpcodeForVectorStParam(SmallVector<SDValue, 8> &Ops, |
2238 | unsigned NumElts, |
2239 | MVT::SimpleValueType MemTy, |
2240 | SelectionDAG *CurDAG, SDLoc DL) { |
// Determine which inputs are registers or immediates, and create new
// target-constant operands for the immediate values.
2243 | SmallVector<bool, 4> IsImm(NumElts, false); |
2244 | for (unsigned i = 0; i < NumElts; i++) { |
2245 | IsImm[i] = (isa<ConstantSDNode>(Val: Ops[i]) || isa<ConstantFPSDNode>(Val: Ops[i])); |
2246 | if (IsImm[i]) { |
2247 | SDValue Imm = Ops[i]; |
2248 | if (MemTy == MVT::f32 || MemTy == MVT::f64) { |
2249 | const ConstantFPSDNode *ConstImm = cast<ConstantFPSDNode>(Val&: Imm); |
2250 | const ConstantFP *CF = ConstImm->getConstantFPValue(); |
2251 | Imm = CurDAG->getTargetConstantFP(Val: *CF, DL, VT: Imm->getValueType(ResNo: 0)); |
2252 | } else { |
2253 | const ConstantSDNode *ConstImm = cast<ConstantSDNode>(Val&: Imm); |
2254 | const ConstantInt *CI = ConstImm->getConstantIntValue(); |
2255 | Imm = CurDAG->getTargetConstant(Val: *CI, DL, VT: Imm->getValueType(ResNo: 0)); |
2256 | } |
2257 | Ops[i] = Imm; |
2258 | } |
2259 | } |
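// For example, a ConstantFP f32 operand is rewritten to a TargetConstantFP so
// that the emitter prints it as an immediate rather than first materializing
// it in a register.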
2260 | |
2261 | // Get opcode for MemTy, size, and register/immediate operand ordering |
2262 | switch (MemTy) { |
2263 | case MVT::i8: |
2264 | return getOpcodeForVectorStParam(NumElts, I8, IsImm); |
2265 | case MVT::i16: |
2266 | return getOpcodeForVectorStParam(NumElts, I16, IsImm); |
2267 | case MVT::i32: |
2268 | return getOpcodeForVectorStParam(NumElts, I32, IsImm); |
2269 | case MVT::i64: |
assert(NumElts == 2 && "i64 does not support NumElts > 2");
2271 | return getOpcodeForVectorStParamV2(I64, IsImm); |
2272 | case MVT::f32: |
2273 | return getOpcodeForVectorStParam(NumElts, F32, IsImm); |
2274 | case MVT::f64: |
assert(NumElts == 2 && "f64 does not support NumElts > 2");
2276 | return getOpcodeForVectorStParamV2(F64, IsImm); |
2277 | |
2278 | // These cases don't support immediates, just use the all register version |
2279 | // and generate moves. |
2280 | case MVT::i1: |
2281 | return (NumElts == 2) ? NVPTX::StoreParamV2I8_rr |
2282 | : NVPTX::StoreParamV4I8_rrrr; |
2283 | case MVT::f16: |
2284 | case MVT::bf16: |
2285 | return (NumElts == 2) ? NVPTX::StoreParamV2I16_rr |
2286 | : NVPTX::StoreParamV4I16_rrrr; |
2287 | case MVT::v2f16: |
2288 | case MVT::v2bf16: |
2289 | case MVT::v2i16: |
2290 | case MVT::v4i8: |
2291 | return (NumElts == 2) ? NVPTX::StoreParamV2I32_rr |
2292 | : NVPTX::StoreParamV4I32_rrrr; |
2293 | default: |
2294 | llvm_unreachable("Cannot select st.param for unknown MemTy" ); |
2295 | } |
2296 | } |
2297 | |
2298 | bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) { |
2299 | SDLoc DL(N); |
2300 | SDValue Chain = N->getOperand(Num: 0); |
2301 | SDValue Param = N->getOperand(Num: 1); |
2302 | unsigned ParamVal = Param->getAsZExtVal(); |
2303 | SDValue Offset = N->getOperand(Num: 2); |
2304 | unsigned OffsetVal = Offset->getAsZExtVal(); |
2305 | MemSDNode *Mem = cast<MemSDNode>(Val: N); |
2306 | SDValue Glue = N->getOperand(Num: N->getNumOperands() - 1); |
2307 | |
2308 | // How many elements do we have? |
2309 | unsigned NumElts; |
2310 | switch (N->getOpcode()) { |
2311 | default: |
2312 | llvm_unreachable("Unexpected opcode" ); |
2313 | case NVPTXISD::StoreParamU32: |
2314 | case NVPTXISD::StoreParamS32: |
2315 | case NVPTXISD::StoreParam: |
2316 | NumElts = 1; |
2317 | break; |
2318 | case NVPTXISD::StoreParamV2: |
2319 | NumElts = 2; |
2320 | break; |
2321 | case NVPTXISD::StoreParamV4: |
2322 | NumElts = 4; |
2323 | break; |
2324 | } |
2325 | |
2326 | // Build vector of operands |
2327 | SmallVector<SDValue, 8> Ops; |
2328 | for (unsigned i = 0; i < NumElts; ++i) |
2329 | Ops.push_back(Elt: N->getOperand(Num: i + 3)); |
2330 | Ops.push_back(Elt: CurDAG->getTargetConstant(Val: ParamVal, DL, VT: MVT::i32)); |
2331 | Ops.push_back(Elt: CurDAG->getTargetConstant(Val: OffsetVal, DL, VT: MVT::i32)); |
2332 | Ops.push_back(Elt: Chain); |
2333 | Ops.push_back(Elt: Glue); |
2334 | |
2335 | // Determine target opcode |
2336 | // If we have an i1, use an 8-bit store. The lowering code in |
2337 | // NVPTXISelLowering will have already emitted an upcast. |
2338 | std::optional<unsigned> Opcode; |
2339 | switch (N->getOpcode()) { |
2340 | default: |
2341 | switch (NumElts) { |
2342 | default: |
2343 | llvm_unreachable("Unexpected NumElts" ); |
2344 | case 1: { |
2345 | MVT::SimpleValueType MemTy = Mem->getMemoryVT().getSimpleVT().SimpleTy; |
2346 | SDValue Imm = Ops[0]; |
2347 | if (MemTy != MVT::f16 && MemTy != MVT::v2f16 && |
2348 | (isa<ConstantSDNode>(Val: Imm) || isa<ConstantFPSDNode>(Val: Imm))) { |
2349 | // Convert immediate to target constant |
2350 | if (MemTy == MVT::f32 || MemTy == MVT::f64) { |
2351 | const ConstantFPSDNode *ConstImm = cast<ConstantFPSDNode>(Val&: Imm); |
2352 | const ConstantFP *CF = ConstImm->getConstantFPValue(); |
2353 | Imm = CurDAG->getTargetConstantFP(Val: *CF, DL, VT: Imm->getValueType(ResNo: 0)); |
2354 | } else { |
2355 | const ConstantSDNode *ConstImm = cast<ConstantSDNode>(Val&: Imm); |
2356 | const ConstantInt *CI = ConstImm->getConstantIntValue(); |
2357 | Imm = CurDAG->getTargetConstant(Val: *CI, DL, VT: Imm->getValueType(ResNo: 0)); |
2358 | } |
2359 | Ops[0] = Imm; |
2360 | // Use immediate version of store param |
2361 | Opcode = pickOpcodeForVT(VT: MemTy, Opcode_i8: NVPTX::StoreParamI8_i, |
2362 | Opcode_i16: NVPTX::StoreParamI16_i, Opcode_i32: NVPTX::StoreParamI32_i, |
2363 | Opcode_i64: NVPTX::StoreParamI64_i, Opcode_f32: NVPTX::StoreParamF32_i, |
2364 | Opcode_f64: NVPTX::StoreParamF64_i); |
2365 | } else |
2366 | Opcode = |
2367 | pickOpcodeForVT(VT: Mem->getMemoryVT().getSimpleVT().SimpleTy, |
2368 | Opcode_i8: NVPTX::StoreParamI8_r, Opcode_i16: NVPTX::StoreParamI16_r, |
2369 | Opcode_i32: NVPTX::StoreParamI32_r, Opcode_i64: NVPTX::StoreParamI64_r, |
2370 | Opcode_f32: NVPTX::StoreParamF32_r, Opcode_f64: NVPTX::StoreParamF64_r); |
2371 | if (Opcode == NVPTX::StoreParamI8_r) { |
2372 | // Fine tune the opcode depending on the size of the operand. |
2373 | // This helps to avoid creating redundant COPY instructions in |
2374 | // InstrEmitter::AddRegisterOperand(). |
2375 | switch (Ops[0].getSimpleValueType().SimpleTy) { |
2376 | default: |
2377 | break; |
2378 | case MVT::i32: |
2379 | Opcode = NVPTX::StoreParamI8TruncI32_r; |
2380 | break; |
2381 | case MVT::i64: |
2382 | Opcode = NVPTX::StoreParamI8TruncI64_r; |
2383 | break; |
2384 | } |
2385 | } |
2386 | break; |
2387 | } |
2388 | case 2: |
2389 | case 4: { |
2390 | MVT::SimpleValueType MemTy = Mem->getMemoryVT().getSimpleVT().SimpleTy; |
2391 | Opcode = pickOpcodeForVectorStParam(Ops, NumElts, MemTy, CurDAG, DL); |
2392 | break; |
2393 | } |
2394 | } |
2395 | break; |
2396 | // Special case: if we have a sign-extend/zero-extend node, insert the |
2397 | // conversion instruction first, and use that as the value operand to |
2398 | // the selected StoreParam node. |
2399 | case NVPTXISD::StoreParamU32: { |
2400 | Opcode = NVPTX::StoreParamI32_r; |
2401 | SDValue CvtNone = CurDAG->getTargetConstant(Val: NVPTX::PTXCvtMode::NONE, DL, |
2402 | VT: MVT::i32); |
2403 | SDNode *Cvt = CurDAG->getMachineNode(Opcode: NVPTX::CVT_u32_u16, dl: DL, |
2404 | VT: MVT::i32, Op1: Ops[0], Op2: CvtNone); |
2405 | Ops[0] = SDValue(Cvt, 0); |
2406 | break; |
2407 | } |
2408 | case NVPTXISD::StoreParamS32: { |
2409 | Opcode = NVPTX::StoreParamI32_r; |
2410 | SDValue CvtNone = CurDAG->getTargetConstant(Val: NVPTX::PTXCvtMode::NONE, DL, |
2411 | VT: MVT::i32); |
2412 | SDNode *Cvt = CurDAG->getMachineNode(Opcode: NVPTX::CVT_s32_s16, dl: DL, |
2413 | VT: MVT::i32, Op1: Ops[0], Op2: CvtNone); |
2414 | Ops[0] = SDValue(Cvt, 0); |
2415 | break; |
2416 | } |
2417 | } |
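// For example, NVPTXISD::StoreParamU32 of an i16 value selects CVT_u32_u16
// followed by StoreParamI32_r, printing roughly as:
//   cvt.u32.u16 %r1, %rs1;
//   st.param.b32 [param0+0], %r1;
// (illustrative PTX; register names and offsets vary).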
2418 | |
2419 | SDVTList RetVTs = CurDAG->getVTList(VT1: MVT::Other, VT2: MVT::Glue); |
2420 | SDNode *Ret = CurDAG->getMachineNode(Opcode: *Opcode, dl: DL, VTs: RetVTs, Ops); |
2421 | MachineMemOperand *MemRef = cast<MemSDNode>(Val: N)->getMemOperand(); |
2422 | CurDAG->setNodeMemRefs(N: cast<MachineSDNode>(Val: Ret), NewMemRefs: {MemRef}); |
2423 | |
2424 | ReplaceNode(F: N, T: Ret); |
2425 | return true; |
2426 | } |
2427 | |
2428 | bool NVPTXDAGToDAGISel::tryTextureIntrinsic(SDNode *N) { |
2429 | unsigned Opc = 0; |
2430 | |
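// This switch is a 1:1 mapping that follows the naming scheme
// NVPTXISD::Tex{Geom}{RetType}{CoordType}[Level|Grad] ->
// NVPTX::TEX_{GEOM}_{RET}_{COORD}[_LEVEL|_GRAD]_RR, where _RR denotes
// register texture and sampler operands (the unified-mode variants further
// down use a single _R texture handle).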
2431 | switch (N->getOpcode()) { |
2432 | default: return false; |
2433 | case NVPTXISD::Tex1DFloatS32: |
2434 | Opc = NVPTX::TEX_1D_F32_S32_RR; |
2435 | break; |
2436 | case NVPTXISD::Tex1DFloatFloat: |
2437 | Opc = NVPTX::TEX_1D_F32_F32_RR; |
2438 | break; |
2439 | case NVPTXISD::Tex1DFloatFloatLevel: |
2440 | Opc = NVPTX::TEX_1D_F32_F32_LEVEL_RR; |
2441 | break; |
2442 | case NVPTXISD::Tex1DFloatFloatGrad: |
2443 | Opc = NVPTX::TEX_1D_F32_F32_GRAD_RR; |
2444 | break; |
2445 | case NVPTXISD::Tex1DS32S32: |
2446 | Opc = NVPTX::TEX_1D_S32_S32_RR; |
2447 | break; |
2448 | case NVPTXISD::Tex1DS32Float: |
2449 | Opc = NVPTX::TEX_1D_S32_F32_RR; |
2450 | break; |
2451 | case NVPTXISD::Tex1DS32FloatLevel: |
2452 | Opc = NVPTX::TEX_1D_S32_F32_LEVEL_RR; |
2453 | break; |
2454 | case NVPTXISD::Tex1DS32FloatGrad: |
2455 | Opc = NVPTX::TEX_1D_S32_F32_GRAD_RR; |
2456 | break; |
2457 | case NVPTXISD::Tex1DU32S32: |
2458 | Opc = NVPTX::TEX_1D_U32_S32_RR; |
2459 | break; |
2460 | case NVPTXISD::Tex1DU32Float: |
2461 | Opc = NVPTX::TEX_1D_U32_F32_RR; |
2462 | break; |
2463 | case NVPTXISD::Tex1DU32FloatLevel: |
2464 | Opc = NVPTX::TEX_1D_U32_F32_LEVEL_RR; |
2465 | break; |
2466 | case NVPTXISD::Tex1DU32FloatGrad: |
2467 | Opc = NVPTX::TEX_1D_U32_F32_GRAD_RR; |
2468 | break; |
2469 | case NVPTXISD::Tex1DArrayFloatS32: |
2470 | Opc = NVPTX::TEX_1D_ARRAY_F32_S32_RR; |
2471 | break; |
2472 | case NVPTXISD::Tex1DArrayFloatFloat: |
2473 | Opc = NVPTX::TEX_1D_ARRAY_F32_F32_RR; |
2474 | break; |
2475 | case NVPTXISD::Tex1DArrayFloatFloatLevel: |
2476 | Opc = NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL_RR; |
2477 | break; |
2478 | case NVPTXISD::Tex1DArrayFloatFloatGrad: |
2479 | Opc = NVPTX::TEX_1D_ARRAY_F32_F32_GRAD_RR; |
2480 | break; |
2481 | case NVPTXISD::Tex1DArrayS32S32: |
2482 | Opc = NVPTX::TEX_1D_ARRAY_S32_S32_RR; |
2483 | break; |
2484 | case NVPTXISD::Tex1DArrayS32Float: |
2485 | Opc = NVPTX::TEX_1D_ARRAY_S32_F32_RR; |
2486 | break; |
2487 | case NVPTXISD::Tex1DArrayS32FloatLevel: |
2488 | Opc = NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL_RR; |
2489 | break; |
2490 | case NVPTXISD::Tex1DArrayS32FloatGrad: |
2491 | Opc = NVPTX::TEX_1D_ARRAY_S32_F32_GRAD_RR; |
2492 | break; |
2493 | case NVPTXISD::Tex1DArrayU32S32: |
2494 | Opc = NVPTX::TEX_1D_ARRAY_U32_S32_RR; |
2495 | break; |
2496 | case NVPTXISD::Tex1DArrayU32Float: |
2497 | Opc = NVPTX::TEX_1D_ARRAY_U32_F32_RR; |
2498 | break; |
2499 | case NVPTXISD::Tex1DArrayU32FloatLevel: |
2500 | Opc = NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL_RR; |
2501 | break; |
2502 | case NVPTXISD::Tex1DArrayU32FloatGrad: |
2503 | Opc = NVPTX::TEX_1D_ARRAY_U32_F32_GRAD_RR; |
2504 | break; |
2505 | case NVPTXISD::Tex2DFloatS32: |
2506 | Opc = NVPTX::TEX_2D_F32_S32_RR; |
2507 | break; |
2508 | case NVPTXISD::Tex2DFloatFloat: |
2509 | Opc = NVPTX::TEX_2D_F32_F32_RR; |
2510 | break; |
2511 | case NVPTXISD::Tex2DFloatFloatLevel: |
2512 | Opc = NVPTX::TEX_2D_F32_F32_LEVEL_RR; |
2513 | break; |
2514 | case NVPTXISD::Tex2DFloatFloatGrad: |
2515 | Opc = NVPTX::TEX_2D_F32_F32_GRAD_RR; |
2516 | break; |
2517 | case NVPTXISD::Tex2DS32S32: |
2518 | Opc = NVPTX::TEX_2D_S32_S32_RR; |
2519 | break; |
2520 | case NVPTXISD::Tex2DS32Float: |
2521 | Opc = NVPTX::TEX_2D_S32_F32_RR; |
2522 | break; |
2523 | case NVPTXISD::Tex2DS32FloatLevel: |
2524 | Opc = NVPTX::TEX_2D_S32_F32_LEVEL_RR; |
2525 | break; |
2526 | case NVPTXISD::Tex2DS32FloatGrad: |
2527 | Opc = NVPTX::TEX_2D_S32_F32_GRAD_RR; |
2528 | break; |
2529 | case NVPTXISD::Tex2DU32S32: |
2530 | Opc = NVPTX::TEX_2D_U32_S32_RR; |
2531 | break; |
2532 | case NVPTXISD::Tex2DU32Float: |
2533 | Opc = NVPTX::TEX_2D_U32_F32_RR; |
2534 | break; |
2535 | case NVPTXISD::Tex2DU32FloatLevel: |
2536 | Opc = NVPTX::TEX_2D_U32_F32_LEVEL_RR; |
2537 | break; |
2538 | case NVPTXISD::Tex2DU32FloatGrad: |
2539 | Opc = NVPTX::TEX_2D_U32_F32_GRAD_RR; |
2540 | break; |
2541 | case NVPTXISD::Tex2DArrayFloatS32: |
2542 | Opc = NVPTX::TEX_2D_ARRAY_F32_S32_RR; |
2543 | break; |
2544 | case NVPTXISD::Tex2DArrayFloatFloat: |
2545 | Opc = NVPTX::TEX_2D_ARRAY_F32_F32_RR; |
2546 | break; |
2547 | case NVPTXISD::Tex2DArrayFloatFloatLevel: |
2548 | Opc = NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL_RR; |
2549 | break; |
2550 | case NVPTXISD::Tex2DArrayFloatFloatGrad: |
2551 | Opc = NVPTX::TEX_2D_ARRAY_F32_F32_GRAD_RR; |
2552 | break; |
2553 | case NVPTXISD::Tex2DArrayS32S32: |
2554 | Opc = NVPTX::TEX_2D_ARRAY_S32_S32_RR; |
2555 | break; |
2556 | case NVPTXISD::Tex2DArrayS32Float: |
2557 | Opc = NVPTX::TEX_2D_ARRAY_S32_F32_RR; |
2558 | break; |
2559 | case NVPTXISD::Tex2DArrayS32FloatLevel: |
2560 | Opc = NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL_RR; |
2561 | break; |
2562 | case NVPTXISD::Tex2DArrayS32FloatGrad: |
2563 | Opc = NVPTX::TEX_2D_ARRAY_S32_F32_GRAD_RR; |
2564 | break; |
2565 | case NVPTXISD::Tex2DArrayU32S32: |
2566 | Opc = NVPTX::TEX_2D_ARRAY_U32_S32_RR; |
2567 | break; |
2568 | case NVPTXISD::Tex2DArrayU32Float: |
2569 | Opc = NVPTX::TEX_2D_ARRAY_U32_F32_RR; |
2570 | break; |
2571 | case NVPTXISD::Tex2DArrayU32FloatLevel: |
2572 | Opc = NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL_RR; |
2573 | break; |
2574 | case NVPTXISD::Tex2DArrayU32FloatGrad: |
2575 | Opc = NVPTX::TEX_2D_ARRAY_U32_F32_GRAD_RR; |
2576 | break; |
2577 | case NVPTXISD::Tex3DFloatS32: |
2578 | Opc = NVPTX::TEX_3D_F32_S32_RR; |
2579 | break; |
2580 | case NVPTXISD::Tex3DFloatFloat: |
2581 | Opc = NVPTX::TEX_3D_F32_F32_RR; |
2582 | break; |
2583 | case NVPTXISD::Tex3DFloatFloatLevel: |
2584 | Opc = NVPTX::TEX_3D_F32_F32_LEVEL_RR; |
2585 | break; |
2586 | case NVPTXISD::Tex3DFloatFloatGrad: |
2587 | Opc = NVPTX::TEX_3D_F32_F32_GRAD_RR; |
2588 | break; |
2589 | case NVPTXISD::Tex3DS32S32: |
2590 | Opc = NVPTX::TEX_3D_S32_S32_RR; |
2591 | break; |
2592 | case NVPTXISD::Tex3DS32Float: |
2593 | Opc = NVPTX::TEX_3D_S32_F32_RR; |
2594 | break; |
2595 | case NVPTXISD::Tex3DS32FloatLevel: |
2596 | Opc = NVPTX::TEX_3D_S32_F32_LEVEL_RR; |
2597 | break; |
2598 | case NVPTXISD::Tex3DS32FloatGrad: |
2599 | Opc = NVPTX::TEX_3D_S32_F32_GRAD_RR; |
2600 | break; |
2601 | case NVPTXISD::Tex3DU32S32: |
2602 | Opc = NVPTX::TEX_3D_U32_S32_RR; |
2603 | break; |
2604 | case NVPTXISD::Tex3DU32Float: |
2605 | Opc = NVPTX::TEX_3D_U32_F32_RR; |
2606 | break; |
2607 | case NVPTXISD::Tex3DU32FloatLevel: |
2608 | Opc = NVPTX::TEX_3D_U32_F32_LEVEL_RR; |
2609 | break; |
2610 | case NVPTXISD::Tex3DU32FloatGrad: |
2611 | Opc = NVPTX::TEX_3D_U32_F32_GRAD_RR; |
2612 | break; |
2613 | case NVPTXISD::TexCubeFloatFloat: |
2614 | Opc = NVPTX::TEX_CUBE_F32_F32_RR; |
2615 | break; |
2616 | case NVPTXISD::TexCubeFloatFloatLevel: |
2617 | Opc = NVPTX::TEX_CUBE_F32_F32_LEVEL_RR; |
2618 | break; |
2619 | case NVPTXISD::TexCubeS32Float: |
2620 | Opc = NVPTX::TEX_CUBE_S32_F32_RR; |
2621 | break; |
2622 | case NVPTXISD::TexCubeS32FloatLevel: |
2623 | Opc = NVPTX::TEX_CUBE_S32_F32_LEVEL_RR; |
2624 | break; |
2625 | case NVPTXISD::TexCubeU32Float: |
2626 | Opc = NVPTX::TEX_CUBE_U32_F32_RR; |
2627 | break; |
2628 | case NVPTXISD::TexCubeU32FloatLevel: |
2629 | Opc = NVPTX::TEX_CUBE_U32_F32_LEVEL_RR; |
2630 | break; |
2631 | case NVPTXISD::TexCubeArrayFloatFloat: |
2632 | Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_RR; |
2633 | break; |
2634 | case NVPTXISD::TexCubeArrayFloatFloatLevel: |
2635 | Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL_RR; |
2636 | break; |
2637 | case NVPTXISD::TexCubeArrayS32Float: |
2638 | Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_RR; |
2639 | break; |
2640 | case NVPTXISD::TexCubeArrayS32FloatLevel: |
2641 | Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL_RR; |
2642 | break; |
2643 | case NVPTXISD::TexCubeArrayU32Float: |
2644 | Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_RR; |
2645 | break; |
2646 | case NVPTXISD::TexCubeArrayU32FloatLevel: |
2647 | Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL_RR; |
2648 | break; |
2649 | case NVPTXISD::Tld4R2DFloatFloat: |
2650 | Opc = NVPTX::TLD4_R_2D_F32_F32_RR; |
2651 | break; |
2652 | case NVPTXISD::Tld4G2DFloatFloat: |
2653 | Opc = NVPTX::TLD4_G_2D_F32_F32_RR; |
2654 | break; |
2655 | case NVPTXISD::Tld4B2DFloatFloat: |
2656 | Opc = NVPTX::TLD4_B_2D_F32_F32_RR; |
2657 | break; |
2658 | case NVPTXISD::Tld4A2DFloatFloat: |
2659 | Opc = NVPTX::TLD4_A_2D_F32_F32_RR; |
2660 | break; |
2661 | case NVPTXISD::Tld4R2DS64Float: |
2662 | Opc = NVPTX::TLD4_R_2D_S32_F32_RR; |
2663 | break; |
2664 | case NVPTXISD::Tld4G2DS64Float: |
2665 | Opc = NVPTX::TLD4_G_2D_S32_F32_RR; |
2666 | break; |
2667 | case NVPTXISD::Tld4B2DS64Float: |
2668 | Opc = NVPTX::TLD4_B_2D_S32_F32_RR; |
2669 | break; |
2670 | case NVPTXISD::Tld4A2DS64Float: |
2671 | Opc = NVPTX::TLD4_A_2D_S32_F32_RR; |
2672 | break; |
2673 | case NVPTXISD::Tld4R2DU64Float: |
2674 | Opc = NVPTX::TLD4_R_2D_U32_F32_RR; |
2675 | break; |
2676 | case NVPTXISD::Tld4G2DU64Float: |
2677 | Opc = NVPTX::TLD4_G_2D_U32_F32_RR; |
2678 | break; |
2679 | case NVPTXISD::Tld4B2DU64Float: |
2680 | Opc = NVPTX::TLD4_B_2D_U32_F32_RR; |
2681 | break; |
2682 | case NVPTXISD::Tld4A2DU64Float: |
2683 | Opc = NVPTX::TLD4_A_2D_U32_F32_RR; |
2684 | break; |
2685 | case NVPTXISD::TexUnified1DFloatS32: |
2686 | Opc = NVPTX::TEX_UNIFIED_1D_F32_S32_R; |
2687 | break; |
2688 | case NVPTXISD::TexUnified1DFloatFloat: |
2689 | Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_R; |
2690 | break; |
2691 | case NVPTXISD::TexUnified1DFloatFloatLevel: |
2692 | Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_LEVEL_R; |
2693 | break; |
2694 | case NVPTXISD::TexUnified1DFloatFloatGrad: |
2695 | Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_GRAD_R; |
2696 | break; |
2697 | case NVPTXISD::TexUnified1DS32S32: |
2698 | Opc = NVPTX::TEX_UNIFIED_1D_S32_S32_R; |
2699 | break; |
2700 | case NVPTXISD::TexUnified1DS32Float: |
2701 | Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_R; |
2702 | break; |
2703 | case NVPTXISD::TexUnified1DS32FloatLevel: |
2704 | Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_LEVEL_R; |
2705 | break; |
2706 | case NVPTXISD::TexUnified1DS32FloatGrad: |
2707 | Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_GRAD_R; |
2708 | break; |
2709 | case NVPTXISD::TexUnified1DU32S32: |
2710 | Opc = NVPTX::TEX_UNIFIED_1D_U32_S32_R; |
2711 | break; |
2712 | case NVPTXISD::TexUnified1DU32Float: |
2713 | Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_R; |
2714 | break; |
2715 | case NVPTXISD::TexUnified1DU32FloatLevel: |
2716 | Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_LEVEL_R; |
2717 | break; |
2718 | case NVPTXISD::TexUnified1DU32FloatGrad: |
2719 | Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_GRAD_R; |
2720 | break; |
2721 | case NVPTXISD::TexUnified1DArrayFloatS32: |
2722 | Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_S32_R; |
2723 | break; |
2724 | case NVPTXISD::TexUnified1DArrayFloatFloat: |
2725 | Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_R; |
2726 | break; |
2727 | case NVPTXISD::TexUnified1DArrayFloatFloatLevel: |
2728 | Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL_R; |
2729 | break; |
2730 | case NVPTXISD::TexUnified1DArrayFloatFloatGrad: |
2731 | Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD_R; |
2732 | break; |
2733 | case NVPTXISD::TexUnified1DArrayS32S32: |
2734 | Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_S32_R; |
2735 | break; |
2736 | case NVPTXISD::TexUnified1DArrayS32Float: |
2737 | Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_R; |
2738 | break; |
2739 | case NVPTXISD::TexUnified1DArrayS32FloatLevel: |
2740 | Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL_R; |
2741 | break; |
2742 | case NVPTXISD::TexUnified1DArrayS32FloatGrad: |
2743 | Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD_R; |
2744 | break; |
2745 | case NVPTXISD::TexUnified1DArrayU32S32: |
2746 | Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_S32_R; |
2747 | break; |
2748 | case NVPTXISD::TexUnified1DArrayU32Float: |
2749 | Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_R; |
2750 | break; |
2751 | case NVPTXISD::TexUnified1DArrayU32FloatLevel: |
2752 | Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL_R; |
2753 | break; |
2754 | case NVPTXISD::TexUnified1DArrayU32FloatGrad: |
2755 | Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD_R; |
2756 | break; |
2757 | case NVPTXISD::TexUnified2DFloatS32: |
2758 | Opc = NVPTX::TEX_UNIFIED_2D_F32_S32_R; |
2759 | break; |
2760 | case NVPTXISD::TexUnified2DFloatFloat: |
2761 | Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_R; |
2762 | break; |
2763 | case NVPTXISD::TexUnified2DFloatFloatLevel: |
2764 | Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_LEVEL_R; |
2765 | break; |
2766 | case NVPTXISD::TexUnified2DFloatFloatGrad: |
2767 | Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_GRAD_R; |
2768 | break; |
2769 | case NVPTXISD::TexUnified2DS32S32: |
2770 | Opc = NVPTX::TEX_UNIFIED_2D_S32_S32_R; |
2771 | break; |
2772 | case NVPTXISD::TexUnified2DS32Float: |
2773 | Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_R; |
2774 | break; |
2775 | case NVPTXISD::TexUnified2DS32FloatLevel: |
2776 | Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_LEVEL_R; |
2777 | break; |
2778 | case NVPTXISD::TexUnified2DS32FloatGrad: |
2779 | Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_GRAD_R; |
2780 | break; |
2781 | case NVPTXISD::TexUnified2DU32S32: |
2782 | Opc = NVPTX::TEX_UNIFIED_2D_U32_S32_R; |
2783 | break; |
2784 | case NVPTXISD::TexUnified2DU32Float: |
2785 | Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_R; |
2786 | break; |
2787 | case NVPTXISD::TexUnified2DU32FloatLevel: |
2788 | Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_LEVEL_R; |
2789 | break; |
2790 | case NVPTXISD::TexUnified2DU32FloatGrad: |
2791 | Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_GRAD_R; |
2792 | break; |
2793 | case NVPTXISD::TexUnified2DArrayFloatS32: |
2794 | Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_S32_R; |
2795 | break; |
2796 | case NVPTXISD::TexUnified2DArrayFloatFloat: |
2797 | Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_R; |
2798 | break; |
2799 | case NVPTXISD::TexUnified2DArrayFloatFloatLevel: |
2800 | Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL_R; |
2801 | break; |
2802 | case NVPTXISD::TexUnified2DArrayFloatFloatGrad: |
2803 | Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD_R; |
2804 | break; |
2805 | case NVPTXISD::TexUnified2DArrayS32S32: |
2806 | Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_S32_R; |
2807 | break; |
2808 | case NVPTXISD::TexUnified2DArrayS32Float: |
2809 | Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_R; |
2810 | break; |
2811 | case NVPTXISD::TexUnified2DArrayS32FloatLevel: |
2812 | Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL_R; |
2813 | break; |
2814 | case NVPTXISD::TexUnified2DArrayS32FloatGrad: |
2815 | Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD_R; |
2816 | break; |
2817 | case NVPTXISD::TexUnified2DArrayU32S32: |
2818 | Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_S32_R; |
2819 | break; |
2820 | case NVPTXISD::TexUnified2DArrayU32Float: |
2821 | Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_R; |
2822 | break; |
2823 | case NVPTXISD::TexUnified2DArrayU32FloatLevel: |
2824 | Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL_R; |
2825 | break; |
2826 | case NVPTXISD::TexUnified2DArrayU32FloatGrad: |
2827 | Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD_R; |
2828 | break; |
2829 | case NVPTXISD::TexUnified3DFloatS32: |
2830 | Opc = NVPTX::TEX_UNIFIED_3D_F32_S32_R; |
2831 | break; |
2832 | case NVPTXISD::TexUnified3DFloatFloat: |
2833 | Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_R; |
2834 | break; |
2835 | case NVPTXISD::TexUnified3DFloatFloatLevel: |
2836 | Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_LEVEL_R; |
2837 | break; |
2838 | case NVPTXISD::TexUnified3DFloatFloatGrad: |
2839 | Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_GRAD_R; |
2840 | break; |
2841 | case NVPTXISD::TexUnified3DS32S32: |
2842 | Opc = NVPTX::TEX_UNIFIED_3D_S32_S32_R; |
2843 | break; |
2844 | case NVPTXISD::TexUnified3DS32Float: |
2845 | Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_R; |
2846 | break; |
2847 | case NVPTXISD::TexUnified3DS32FloatLevel: |
2848 | Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_LEVEL_R; |
2849 | break; |
2850 | case NVPTXISD::TexUnified3DS32FloatGrad: |
2851 | Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_GRAD_R; |
2852 | break; |
2853 | case NVPTXISD::TexUnified3DU32S32: |
2854 | Opc = NVPTX::TEX_UNIFIED_3D_U32_S32_R; |
2855 | break; |
2856 | case NVPTXISD::TexUnified3DU32Float: |
2857 | Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_R; |
2858 | break; |
2859 | case NVPTXISD::TexUnified3DU32FloatLevel: |
2860 | Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_LEVEL_R; |
2861 | break; |
2862 | case NVPTXISD::TexUnified3DU32FloatGrad: |
2863 | Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_GRAD_R; |
2864 | break; |
2865 | case NVPTXISD::TexUnifiedCubeFloatFloat: |
2866 | Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_R; |
2867 | break; |
2868 | case NVPTXISD::TexUnifiedCubeFloatFloatLevel: |
2869 | Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_LEVEL_R; |
2870 | break; |
2871 | case NVPTXISD::TexUnifiedCubeS32Float: |
2872 | Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_R; |
2873 | break; |
2874 | case NVPTXISD::TexUnifiedCubeS32FloatLevel: |
2875 | Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_LEVEL_R; |
2876 | break; |
2877 | case NVPTXISD::TexUnifiedCubeU32Float: |
2878 | Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_R; |
2879 | break; |
2880 | case NVPTXISD::TexUnifiedCubeU32FloatLevel: |
2881 | Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_LEVEL_R; |
2882 | break; |
2883 | case NVPTXISD::TexUnifiedCubeArrayFloatFloat: |
2884 | Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_R; |
2885 | break; |
2886 | case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel: |
2887 | Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL_R; |
2888 | break; |
2889 | case NVPTXISD::TexUnifiedCubeArrayS32Float: |
2890 | Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_R; |
2891 | break; |
2892 | case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel: |
2893 | Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL_R; |
2894 | break; |
2895 | case NVPTXISD::TexUnifiedCubeArrayU32Float: |
2896 | Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_R; |
2897 | break; |
2898 | case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel: |
2899 | Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL_R; |
2900 | break; |
2901 | case NVPTXISD::Tld4UnifiedR2DFloatFloat: |
2902 | Opc = NVPTX::TLD4_UNIFIED_R_2D_F32_F32_R; |
2903 | break; |
2904 | case NVPTXISD::Tld4UnifiedG2DFloatFloat: |
2905 | Opc = NVPTX::TLD4_UNIFIED_G_2D_F32_F32_R; |
2906 | break; |
2907 | case NVPTXISD::Tld4UnifiedB2DFloatFloat: |
2908 | Opc = NVPTX::TLD4_UNIFIED_B_2D_F32_F32_R; |
2909 | break; |
2910 | case NVPTXISD::Tld4UnifiedA2DFloatFloat: |
2911 | Opc = NVPTX::TLD4_UNIFIED_A_2D_F32_F32_R; |
2912 | break; |
2913 | case NVPTXISD::Tld4UnifiedR2DS64Float: |
2914 | Opc = NVPTX::TLD4_UNIFIED_R_2D_S32_F32_R; |
2915 | break; |
2916 | case NVPTXISD::Tld4UnifiedG2DS64Float: |
2917 | Opc = NVPTX::TLD4_UNIFIED_G_2D_S32_F32_R; |
2918 | break; |
2919 | case NVPTXISD::Tld4UnifiedB2DS64Float: |
2920 | Opc = NVPTX::TLD4_UNIFIED_B_2D_S32_F32_R; |
2921 | break; |
2922 | case NVPTXISD::Tld4UnifiedA2DS64Float: |
2923 | Opc = NVPTX::TLD4_UNIFIED_A_2D_S32_F32_R; |
2924 | break; |
2925 | case NVPTXISD::Tld4UnifiedR2DU64Float: |
2926 | Opc = NVPTX::TLD4_UNIFIED_R_2D_U32_F32_R; |
2927 | break; |
2928 | case NVPTXISD::Tld4UnifiedG2DU64Float: |
2929 | Opc = NVPTX::TLD4_UNIFIED_G_2D_U32_F32_R; |
2930 | break; |
2931 | case NVPTXISD::Tld4UnifiedB2DU64Float: |
2932 | Opc = NVPTX::TLD4_UNIFIED_B_2D_U32_F32_R; |
2933 | break; |
2934 | case NVPTXISD::Tld4UnifiedA2DU64Float: |
2935 | Opc = NVPTX::TLD4_UNIFIED_A_2D_U32_F32_R; |
2936 | break; |
2937 | case NVPTXISD::TexUnifiedCubeFloatFloatGrad: |
2938 | Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_GRAD_R; |
2939 | break; |
2940 | case NVPTXISD::TexUnifiedCubeS32FloatGrad: |
2941 | Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_GRAD_R; |
2942 | break; |
2943 | case NVPTXISD::TexUnifiedCubeU32FloatGrad: |
2944 | Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_GRAD_R; |
2945 | break; |
2946 | case NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad: |
2947 | Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_GRAD_R; |
2948 | break; |
2949 | case NVPTXISD::TexUnifiedCubeArrayS32FloatGrad: |
2950 | Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_GRAD_R; |
2951 | break; |
2952 | case NVPTXISD::TexUnifiedCubeArrayU32FloatGrad: |
2953 | Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_GRAD_R; |
2954 | break; |
2955 | } |
2956 | |
2957 | // Copy over operands |
2958 | SmallVector<SDValue, 8> Ops(drop_begin(RangeOrContainer: N->ops())); |
2959 | Ops.push_back(Elt: N->getOperand(Num: 0)); // Move chain to the back. |
2960 | |
2961 | ReplaceNode(F: N, T: CurDAG->getMachineNode(Opcode: Opc, dl: SDLoc(N), VTs: N->getVTList(), Ops)); |
2962 | return true; |
2963 | } |
2964 | |
2965 | bool NVPTXDAGToDAGISel::trySurfaceIntrinsic(SDNode *N) { |
2966 | unsigned Opc = 0; |
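  // The opcode suffix selects the surface load's PTX out-of-range handling
  // mode, roughly: CLAMP clamps coordinates into bounds, TRAP raises an
  // error, and ZERO loads zero for out-of-range accesses.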
2967 | switch (N->getOpcode()) { |
2968 | default: return false; |
2969 | case NVPTXISD::Suld1DI8Clamp: |
2970 | Opc = NVPTX::SULD_1D_I8_CLAMP_R; |
2971 | break; |
2972 | case NVPTXISD::Suld1DI16Clamp: |
2973 | Opc = NVPTX::SULD_1D_I16_CLAMP_R; |
2974 | break; |
2975 | case NVPTXISD::Suld1DI32Clamp: |
2976 | Opc = NVPTX::SULD_1D_I32_CLAMP_R; |
2977 | break; |
2978 | case NVPTXISD::Suld1DI64Clamp: |
2979 | Opc = NVPTX::SULD_1D_I64_CLAMP_R; |
2980 | break; |
2981 | case NVPTXISD::Suld1DV2I8Clamp: |
2982 | Opc = NVPTX::SULD_1D_V2I8_CLAMP_R; |
2983 | break; |
2984 | case NVPTXISD::Suld1DV2I16Clamp: |
2985 | Opc = NVPTX::SULD_1D_V2I16_CLAMP_R; |
2986 | break; |
2987 | case NVPTXISD::Suld1DV2I32Clamp: |
2988 | Opc = NVPTX::SULD_1D_V2I32_CLAMP_R; |
2989 | break; |
2990 | case NVPTXISD::Suld1DV2I64Clamp: |
2991 | Opc = NVPTX::SULD_1D_V2I64_CLAMP_R; |
2992 | break; |
2993 | case NVPTXISD::Suld1DV4I8Clamp: |
2994 | Opc = NVPTX::SULD_1D_V4I8_CLAMP_R; |
2995 | break; |
2996 | case NVPTXISD::Suld1DV4I16Clamp: |
2997 | Opc = NVPTX::SULD_1D_V4I16_CLAMP_R; |
2998 | break; |
2999 | case NVPTXISD::Suld1DV4I32Clamp: |
3000 | Opc = NVPTX::SULD_1D_V4I32_CLAMP_R; |
3001 | break; |
3002 | case NVPTXISD::Suld1DArrayI8Clamp: |
3003 | Opc = NVPTX::SULD_1D_ARRAY_I8_CLAMP_R; |
3004 | break; |
3005 | case NVPTXISD::Suld1DArrayI16Clamp: |
3006 | Opc = NVPTX::SULD_1D_ARRAY_I16_CLAMP_R; |
3007 | break; |
3008 | case NVPTXISD::Suld1DArrayI32Clamp: |
3009 | Opc = NVPTX::SULD_1D_ARRAY_I32_CLAMP_R; |
3010 | break; |
3011 | case NVPTXISD::Suld1DArrayI64Clamp: |
3012 | Opc = NVPTX::SULD_1D_ARRAY_I64_CLAMP_R; |
3013 | break; |
3014 | case NVPTXISD::Suld1DArrayV2I8Clamp: |
3015 | Opc = NVPTX::SULD_1D_ARRAY_V2I8_CLAMP_R; |
3016 | break; |
3017 | case NVPTXISD::Suld1DArrayV2I16Clamp: |
3018 | Opc = NVPTX::SULD_1D_ARRAY_V2I16_CLAMP_R; |
3019 | break; |
3020 | case NVPTXISD::Suld1DArrayV2I32Clamp: |
3021 | Opc = NVPTX::SULD_1D_ARRAY_V2I32_CLAMP_R; |
3022 | break; |
3023 | case NVPTXISD::Suld1DArrayV2I64Clamp: |
3024 | Opc = NVPTX::SULD_1D_ARRAY_V2I64_CLAMP_R; |
3025 | break; |
3026 | case NVPTXISD::Suld1DArrayV4I8Clamp: |
3027 | Opc = NVPTX::SULD_1D_ARRAY_V4I8_CLAMP_R; |
3028 | break; |
3029 | case NVPTXISD::Suld1DArrayV4I16Clamp: |
3030 | Opc = NVPTX::SULD_1D_ARRAY_V4I16_CLAMP_R; |
3031 | break; |
3032 | case NVPTXISD::Suld1DArrayV4I32Clamp: |
3033 | Opc = NVPTX::SULD_1D_ARRAY_V4I32_CLAMP_R; |
3034 | break; |
3035 | case NVPTXISD::Suld2DI8Clamp: |
3036 | Opc = NVPTX::SULD_2D_I8_CLAMP_R; |
3037 | break; |
3038 | case NVPTXISD::Suld2DI16Clamp: |
3039 | Opc = NVPTX::SULD_2D_I16_CLAMP_R; |
3040 | break; |
3041 | case NVPTXISD::Suld2DI32Clamp: |
3042 | Opc = NVPTX::SULD_2D_I32_CLAMP_R; |
3043 | break; |
3044 | case NVPTXISD::Suld2DI64Clamp: |
3045 | Opc = NVPTX::SULD_2D_I64_CLAMP_R; |
3046 | break; |
3047 | case NVPTXISD::Suld2DV2I8Clamp: |
3048 | Opc = NVPTX::SULD_2D_V2I8_CLAMP_R; |
3049 | break; |
3050 | case NVPTXISD::Suld2DV2I16Clamp: |
3051 | Opc = NVPTX::SULD_2D_V2I16_CLAMP_R; |
3052 | break; |
3053 | case NVPTXISD::Suld2DV2I32Clamp: |
3054 | Opc = NVPTX::SULD_2D_V2I32_CLAMP_R; |
3055 | break; |
3056 | case NVPTXISD::Suld2DV2I64Clamp: |
3057 | Opc = NVPTX::SULD_2D_V2I64_CLAMP_R; |
3058 | break; |
3059 | case NVPTXISD::Suld2DV4I8Clamp: |
3060 | Opc = NVPTX::SULD_2D_V4I8_CLAMP_R; |
3061 | break; |
3062 | case NVPTXISD::Suld2DV4I16Clamp: |
3063 | Opc = NVPTX::SULD_2D_V4I16_CLAMP_R; |
3064 | break; |
3065 | case NVPTXISD::Suld2DV4I32Clamp: |
3066 | Opc = NVPTX::SULD_2D_V4I32_CLAMP_R; |
3067 | break; |
3068 | case NVPTXISD::Suld2DArrayI8Clamp: |
3069 | Opc = NVPTX::SULD_2D_ARRAY_I8_CLAMP_R; |
3070 | break; |
3071 | case NVPTXISD::Suld2DArrayI16Clamp: |
3072 | Opc = NVPTX::SULD_2D_ARRAY_I16_CLAMP_R; |
3073 | break; |
3074 | case NVPTXISD::Suld2DArrayI32Clamp: |
3075 | Opc = NVPTX::SULD_2D_ARRAY_I32_CLAMP_R; |
3076 | break; |
3077 | case NVPTXISD::Suld2DArrayI64Clamp: |
3078 | Opc = NVPTX::SULD_2D_ARRAY_I64_CLAMP_R; |
3079 | break; |
3080 | case NVPTXISD::Suld2DArrayV2I8Clamp: |
3081 | Opc = NVPTX::SULD_2D_ARRAY_V2I8_CLAMP_R; |
3082 | break; |
3083 | case NVPTXISD::Suld2DArrayV2I16Clamp: |
3084 | Opc = NVPTX::SULD_2D_ARRAY_V2I16_CLAMP_R; |
3085 | break; |
3086 | case NVPTXISD::Suld2DArrayV2I32Clamp: |
3087 | Opc = NVPTX::SULD_2D_ARRAY_V2I32_CLAMP_R; |
3088 | break; |
3089 | case NVPTXISD::Suld2DArrayV2I64Clamp: |
3090 | Opc = NVPTX::SULD_2D_ARRAY_V2I64_CLAMP_R; |
3091 | break; |
3092 | case NVPTXISD::Suld2DArrayV4I8Clamp: |
3093 | Opc = NVPTX::SULD_2D_ARRAY_V4I8_CLAMP_R; |
3094 | break; |
3095 | case NVPTXISD::Suld2DArrayV4I16Clamp: |
3096 | Opc = NVPTX::SULD_2D_ARRAY_V4I16_CLAMP_R; |
3097 | break; |
3098 | case NVPTXISD::Suld2DArrayV4I32Clamp: |
3099 | Opc = NVPTX::SULD_2D_ARRAY_V4I32_CLAMP_R; |
3100 | break; |
3101 | case NVPTXISD::Suld3DI8Clamp: |
3102 | Opc = NVPTX::SULD_3D_I8_CLAMP_R; |
3103 | break; |
3104 | case NVPTXISD::Suld3DI16Clamp: |
3105 | Opc = NVPTX::SULD_3D_I16_CLAMP_R; |
3106 | break; |
3107 | case NVPTXISD::Suld3DI32Clamp: |
3108 | Opc = NVPTX::SULD_3D_I32_CLAMP_R; |
3109 | break; |
3110 | case NVPTXISD::Suld3DI64Clamp: |
3111 | Opc = NVPTX::SULD_3D_I64_CLAMP_R; |
3112 | break; |
3113 | case NVPTXISD::Suld3DV2I8Clamp: |
3114 | Opc = NVPTX::SULD_3D_V2I8_CLAMP_R; |
3115 | break; |
3116 | case NVPTXISD::Suld3DV2I16Clamp: |
3117 | Opc = NVPTX::SULD_3D_V2I16_CLAMP_R; |
3118 | break; |
3119 | case NVPTXISD::Suld3DV2I32Clamp: |
3120 | Opc = NVPTX::SULD_3D_V2I32_CLAMP_R; |
3121 | break; |
3122 | case NVPTXISD::Suld3DV2I64Clamp: |
3123 | Opc = NVPTX::SULD_3D_V2I64_CLAMP_R; |
3124 | break; |
3125 | case NVPTXISD::Suld3DV4I8Clamp: |
3126 | Opc = NVPTX::SULD_3D_V4I8_CLAMP_R; |
3127 | break; |
3128 | case NVPTXISD::Suld3DV4I16Clamp: |
3129 | Opc = NVPTX::SULD_3D_V4I16_CLAMP_R; |
3130 | break; |
3131 | case NVPTXISD::Suld3DV4I32Clamp: |
3132 | Opc = NVPTX::SULD_3D_V4I32_CLAMP_R; |
3133 | break; |
3134 | case NVPTXISD::Suld1DI8Trap: |
3135 | Opc = NVPTX::SULD_1D_I8_TRAP_R; |
3136 | break; |
3137 | case NVPTXISD::Suld1DI16Trap: |
3138 | Opc = NVPTX::SULD_1D_I16_TRAP_R; |
3139 | break; |
3140 | case NVPTXISD::Suld1DI32Trap: |
3141 | Opc = NVPTX::SULD_1D_I32_TRAP_R; |
3142 | break; |
3143 | case NVPTXISD::Suld1DI64Trap: |
3144 | Opc = NVPTX::SULD_1D_I64_TRAP_R; |
3145 | break; |
3146 | case NVPTXISD::Suld1DV2I8Trap: |
3147 | Opc = NVPTX::SULD_1D_V2I8_TRAP_R; |
3148 | break; |
3149 | case NVPTXISD::Suld1DV2I16Trap: |
3150 | Opc = NVPTX::SULD_1D_V2I16_TRAP_R; |
3151 | break; |
3152 | case NVPTXISD::Suld1DV2I32Trap: |
3153 | Opc = NVPTX::SULD_1D_V2I32_TRAP_R; |
3154 | break; |
3155 | case NVPTXISD::Suld1DV2I64Trap: |
3156 | Opc = NVPTX::SULD_1D_V2I64_TRAP_R; |
3157 | break; |
3158 | case NVPTXISD::Suld1DV4I8Trap: |
3159 | Opc = NVPTX::SULD_1D_V4I8_TRAP_R; |
3160 | break; |
3161 | case NVPTXISD::Suld1DV4I16Trap: |
3162 | Opc = NVPTX::SULD_1D_V4I16_TRAP_R; |
3163 | break; |
3164 | case NVPTXISD::Suld1DV4I32Trap: |
3165 | Opc = NVPTX::SULD_1D_V4I32_TRAP_R; |
3166 | break; |
3167 | case NVPTXISD::Suld1DArrayI8Trap: |
3168 | Opc = NVPTX::SULD_1D_ARRAY_I8_TRAP_R; |
3169 | break; |
3170 | case NVPTXISD::Suld1DArrayI16Trap: |
3171 | Opc = NVPTX::SULD_1D_ARRAY_I16_TRAP_R; |
3172 | break; |
3173 | case NVPTXISD::Suld1DArrayI32Trap: |
3174 | Opc = NVPTX::SULD_1D_ARRAY_I32_TRAP_R; |
3175 | break; |
3176 | case NVPTXISD::Suld1DArrayI64Trap: |
3177 | Opc = NVPTX::SULD_1D_ARRAY_I64_TRAP_R; |
3178 | break; |
3179 | case NVPTXISD::Suld1DArrayV2I8Trap: |
3180 | Opc = NVPTX::SULD_1D_ARRAY_V2I8_TRAP_R; |
3181 | break; |
3182 | case NVPTXISD::Suld1DArrayV2I16Trap: |
3183 | Opc = NVPTX::SULD_1D_ARRAY_V2I16_TRAP_R; |
3184 | break; |
3185 | case NVPTXISD::Suld1DArrayV2I32Trap: |
3186 | Opc = NVPTX::SULD_1D_ARRAY_V2I32_TRAP_R; |
3187 | break; |
3188 | case NVPTXISD::Suld1DArrayV2I64Trap: |
3189 | Opc = NVPTX::SULD_1D_ARRAY_V2I64_TRAP_R; |
3190 | break; |
3191 | case NVPTXISD::Suld1DArrayV4I8Trap: |
3192 | Opc = NVPTX::SULD_1D_ARRAY_V4I8_TRAP_R; |
3193 | break; |
3194 | case NVPTXISD::Suld1DArrayV4I16Trap: |
3195 | Opc = NVPTX::SULD_1D_ARRAY_V4I16_TRAP_R; |
3196 | break; |
3197 | case NVPTXISD::Suld1DArrayV4I32Trap: |
3198 | Opc = NVPTX::SULD_1D_ARRAY_V4I32_TRAP_R; |
3199 | break; |
3200 | case NVPTXISD::Suld2DI8Trap: |
3201 | Opc = NVPTX::SULD_2D_I8_TRAP_R; |
3202 | break; |
3203 | case NVPTXISD::Suld2DI16Trap: |
3204 | Opc = NVPTX::SULD_2D_I16_TRAP_R; |
3205 | break; |
3206 | case NVPTXISD::Suld2DI32Trap: |
3207 | Opc = NVPTX::SULD_2D_I32_TRAP_R; |
3208 | break; |
3209 | case NVPTXISD::Suld2DI64Trap: |
3210 | Opc = NVPTX::SULD_2D_I64_TRAP_R; |
3211 | break; |
3212 | case NVPTXISD::Suld2DV2I8Trap: |
3213 | Opc = NVPTX::SULD_2D_V2I8_TRAP_R; |
3214 | break; |
3215 | case NVPTXISD::Suld2DV2I16Trap: |
3216 | Opc = NVPTX::SULD_2D_V2I16_TRAP_R; |
3217 | break; |
3218 | case NVPTXISD::Suld2DV2I32Trap: |
3219 | Opc = NVPTX::SULD_2D_V2I32_TRAP_R; |
3220 | break; |
3221 | case NVPTXISD::Suld2DV2I64Trap: |
3222 | Opc = NVPTX::SULD_2D_V2I64_TRAP_R; |
3223 | break; |
3224 | case NVPTXISD::Suld2DV4I8Trap: |
3225 | Opc = NVPTX::SULD_2D_V4I8_TRAP_R; |
3226 | break; |
3227 | case NVPTXISD::Suld2DV4I16Trap: |
3228 | Opc = NVPTX::SULD_2D_V4I16_TRAP_R; |
3229 | break; |
3230 | case NVPTXISD::Suld2DV4I32Trap: |
3231 | Opc = NVPTX::SULD_2D_V4I32_TRAP_R; |
3232 | break; |
3233 | case NVPTXISD::Suld2DArrayI8Trap: |
3234 | Opc = NVPTX::SULD_2D_ARRAY_I8_TRAP_R; |
3235 | break; |
3236 | case NVPTXISD::Suld2DArrayI16Trap: |
3237 | Opc = NVPTX::SULD_2D_ARRAY_I16_TRAP_R; |
3238 | break; |
3239 | case NVPTXISD::Suld2DArrayI32Trap: |
3240 | Opc = NVPTX::SULD_2D_ARRAY_I32_TRAP_R; |
3241 | break; |
3242 | case NVPTXISD::Suld2DArrayI64Trap: |
3243 | Opc = NVPTX::SULD_2D_ARRAY_I64_TRAP_R; |
3244 | break; |
3245 | case NVPTXISD::Suld2DArrayV2I8Trap: |
3246 | Opc = NVPTX::SULD_2D_ARRAY_V2I8_TRAP_R; |
3247 | break; |
3248 | case NVPTXISD::Suld2DArrayV2I16Trap: |
3249 | Opc = NVPTX::SULD_2D_ARRAY_V2I16_TRAP_R; |
3250 | break; |
3251 | case NVPTXISD::Suld2DArrayV2I32Trap: |
3252 | Opc = NVPTX::SULD_2D_ARRAY_V2I32_TRAP_R; |
3253 | break; |
3254 | case NVPTXISD::Suld2DArrayV2I64Trap: |
3255 | Opc = NVPTX::SULD_2D_ARRAY_V2I64_TRAP_R; |
3256 | break; |
3257 | case NVPTXISD::Suld2DArrayV4I8Trap: |
3258 | Opc = NVPTX::SULD_2D_ARRAY_V4I8_TRAP_R; |
3259 | break; |
3260 | case NVPTXISD::Suld2DArrayV4I16Trap: |
3261 | Opc = NVPTX::SULD_2D_ARRAY_V4I16_TRAP_R; |
3262 | break; |
3263 | case NVPTXISD::Suld2DArrayV4I32Trap: |
3264 | Opc = NVPTX::SULD_2D_ARRAY_V4I32_TRAP_R; |
3265 | break; |
3266 | case NVPTXISD::Suld3DI8Trap: |
3267 | Opc = NVPTX::SULD_3D_I8_TRAP_R; |
3268 | break; |
3269 | case NVPTXISD::Suld3DI16Trap: |
3270 | Opc = NVPTX::SULD_3D_I16_TRAP_R; |
3271 | break; |
3272 | case NVPTXISD::Suld3DI32Trap: |
3273 | Opc = NVPTX::SULD_3D_I32_TRAP_R; |
3274 | break; |
3275 | case NVPTXISD::Suld3DI64Trap: |
3276 | Opc = NVPTX::SULD_3D_I64_TRAP_R; |
3277 | break; |
3278 | case NVPTXISD::Suld3DV2I8Trap: |
3279 | Opc = NVPTX::SULD_3D_V2I8_TRAP_R; |
3280 | break; |
3281 | case NVPTXISD::Suld3DV2I16Trap: |
3282 | Opc = NVPTX::SULD_3D_V2I16_TRAP_R; |
3283 | break; |
3284 | case NVPTXISD::Suld3DV2I32Trap: |
3285 | Opc = NVPTX::SULD_3D_V2I32_TRAP_R; |
3286 | break; |
3287 | case NVPTXISD::Suld3DV2I64Trap: |
3288 | Opc = NVPTX::SULD_3D_V2I64_TRAP_R; |
3289 | break; |
3290 | case NVPTXISD::Suld3DV4I8Trap: |
3291 | Opc = NVPTX::SULD_3D_V4I8_TRAP_R; |
3292 | break; |
3293 | case NVPTXISD::Suld3DV4I16Trap: |
3294 | Opc = NVPTX::SULD_3D_V4I16_TRAP_R; |
3295 | break; |
3296 | case NVPTXISD::Suld3DV4I32Trap: |
3297 | Opc = NVPTX::SULD_3D_V4I32_TRAP_R; |
3298 | break; |
3299 | case NVPTXISD::Suld1DI8Zero: |
3300 | Opc = NVPTX::SULD_1D_I8_ZERO_R; |
3301 | break; |
3302 | case NVPTXISD::Suld1DI16Zero: |
3303 | Opc = NVPTX::SULD_1D_I16_ZERO_R; |
3304 | break; |
3305 | case NVPTXISD::Suld1DI32Zero: |
3306 | Opc = NVPTX::SULD_1D_I32_ZERO_R; |
3307 | break; |
3308 | case NVPTXISD::Suld1DI64Zero: |
3309 | Opc = NVPTX::SULD_1D_I64_ZERO_R; |
3310 | break; |
3311 | case NVPTXISD::Suld1DV2I8Zero: |
3312 | Opc = NVPTX::SULD_1D_V2I8_ZERO_R; |
3313 | break; |
3314 | case NVPTXISD::Suld1DV2I16Zero: |
3315 | Opc = NVPTX::SULD_1D_V2I16_ZERO_R; |
3316 | break; |
3317 | case NVPTXISD::Suld1DV2I32Zero: |
3318 | Opc = NVPTX::SULD_1D_V2I32_ZERO_R; |
3319 | break; |
3320 | case NVPTXISD::Suld1DV2I64Zero: |
3321 | Opc = NVPTX::SULD_1D_V2I64_ZERO_R; |
3322 | break; |
3323 | case NVPTXISD::Suld1DV4I8Zero: |
3324 | Opc = NVPTX::SULD_1D_V4I8_ZERO_R; |
3325 | break; |
3326 | case NVPTXISD::Suld1DV4I16Zero: |
3327 | Opc = NVPTX::SULD_1D_V4I16_ZERO_R; |
3328 | break; |
3329 | case NVPTXISD::Suld1DV4I32Zero: |
3330 | Opc = NVPTX::SULD_1D_V4I32_ZERO_R; |
3331 | break; |
3332 | case NVPTXISD::Suld1DArrayI8Zero: |
3333 | Opc = NVPTX::SULD_1D_ARRAY_I8_ZERO_R; |
3334 | break; |
3335 | case NVPTXISD::Suld1DArrayI16Zero: |
3336 | Opc = NVPTX::SULD_1D_ARRAY_I16_ZERO_R; |
3337 | break; |
3338 | case NVPTXISD::Suld1DArrayI32Zero: |
3339 | Opc = NVPTX::SULD_1D_ARRAY_I32_ZERO_R; |
3340 | break; |
3341 | case NVPTXISD::Suld1DArrayI64Zero: |
3342 | Opc = NVPTX::SULD_1D_ARRAY_I64_ZERO_R; |
3343 | break; |
3344 | case NVPTXISD::Suld1DArrayV2I8Zero: |
3345 | Opc = NVPTX::SULD_1D_ARRAY_V2I8_ZERO_R; |
3346 | break; |
3347 | case NVPTXISD::Suld1DArrayV2I16Zero: |
3348 | Opc = NVPTX::SULD_1D_ARRAY_V2I16_ZERO_R; |
3349 | break; |
3350 | case NVPTXISD::Suld1DArrayV2I32Zero: |
3351 | Opc = NVPTX::SULD_1D_ARRAY_V2I32_ZERO_R; |
3352 | break; |
3353 | case NVPTXISD::Suld1DArrayV2I64Zero: |
3354 | Opc = NVPTX::SULD_1D_ARRAY_V2I64_ZERO_R; |
3355 | break; |
3356 | case NVPTXISD::Suld1DArrayV4I8Zero: |
3357 | Opc = NVPTX::SULD_1D_ARRAY_V4I8_ZERO_R; |
3358 | break; |
3359 | case NVPTXISD::Suld1DArrayV4I16Zero: |
3360 | Opc = NVPTX::SULD_1D_ARRAY_V4I16_ZERO_R; |
3361 | break; |
3362 | case NVPTXISD::Suld1DArrayV4I32Zero: |
3363 | Opc = NVPTX::SULD_1D_ARRAY_V4I32_ZERO_R; |
3364 | break; |
3365 | case NVPTXISD::Suld2DI8Zero: |
3366 | Opc = NVPTX::SULD_2D_I8_ZERO_R; |
3367 | break; |
3368 | case NVPTXISD::Suld2DI16Zero: |
3369 | Opc = NVPTX::SULD_2D_I16_ZERO_R; |
3370 | break; |
3371 | case NVPTXISD::Suld2DI32Zero: |
3372 | Opc = NVPTX::SULD_2D_I32_ZERO_R; |
3373 | break; |
3374 | case NVPTXISD::Suld2DI64Zero: |
3375 | Opc = NVPTX::SULD_2D_I64_ZERO_R; |
3376 | break; |
3377 | case NVPTXISD::Suld2DV2I8Zero: |
3378 | Opc = NVPTX::SULD_2D_V2I8_ZERO_R; |
3379 | break; |
3380 | case NVPTXISD::Suld2DV2I16Zero: |
3381 | Opc = NVPTX::SULD_2D_V2I16_ZERO_R; |
3382 | break; |
3383 | case NVPTXISD::Suld2DV2I32Zero: |
3384 | Opc = NVPTX::SULD_2D_V2I32_ZERO_R; |
3385 | break; |
3386 | case NVPTXISD::Suld2DV2I64Zero: |
3387 | Opc = NVPTX::SULD_2D_V2I64_ZERO_R; |
3388 | break; |
3389 | case NVPTXISD::Suld2DV4I8Zero: |
3390 | Opc = NVPTX::SULD_2D_V4I8_ZERO_R; |
3391 | break; |
3392 | case NVPTXISD::Suld2DV4I16Zero: |
3393 | Opc = NVPTX::SULD_2D_V4I16_ZERO_R; |
3394 | break; |
3395 | case NVPTXISD::Suld2DV4I32Zero: |
3396 | Opc = NVPTX::SULD_2D_V4I32_ZERO_R; |
3397 | break; |
3398 | case NVPTXISD::Suld2DArrayI8Zero: |
3399 | Opc = NVPTX::SULD_2D_ARRAY_I8_ZERO_R; |
3400 | break; |
3401 | case NVPTXISD::Suld2DArrayI16Zero: |
3402 | Opc = NVPTX::SULD_2D_ARRAY_I16_ZERO_R; |
3403 | break; |
3404 | case NVPTXISD::Suld2DArrayI32Zero: |
3405 | Opc = NVPTX::SULD_2D_ARRAY_I32_ZERO_R; |
3406 | break; |
3407 | case NVPTXISD::Suld2DArrayI64Zero: |
3408 | Opc = NVPTX::SULD_2D_ARRAY_I64_ZERO_R; |
3409 | break; |
3410 | case NVPTXISD::Suld2DArrayV2I8Zero: |
3411 | Opc = NVPTX::SULD_2D_ARRAY_V2I8_ZERO_R; |
3412 | break; |
3413 | case NVPTXISD::Suld2DArrayV2I16Zero: |
3414 | Opc = NVPTX::SULD_2D_ARRAY_V2I16_ZERO_R; |
3415 | break; |
3416 | case NVPTXISD::Suld2DArrayV2I32Zero: |
3417 | Opc = NVPTX::SULD_2D_ARRAY_V2I32_ZERO_R; |
3418 | break; |
3419 | case NVPTXISD::Suld2DArrayV2I64Zero: |
3420 | Opc = NVPTX::SULD_2D_ARRAY_V2I64_ZERO_R; |
3421 | break; |
3422 | case NVPTXISD::Suld2DArrayV4I8Zero: |
3423 | Opc = NVPTX::SULD_2D_ARRAY_V4I8_ZERO_R; |
3424 | break; |
3425 | case NVPTXISD::Suld2DArrayV4I16Zero: |
3426 | Opc = NVPTX::SULD_2D_ARRAY_V4I16_ZERO_R; |
3427 | break; |
3428 | case NVPTXISD::Suld2DArrayV4I32Zero: |
3429 | Opc = NVPTX::SULD_2D_ARRAY_V4I32_ZERO_R; |
3430 | break; |
3431 | case NVPTXISD::Suld3DI8Zero: |
3432 | Opc = NVPTX::SULD_3D_I8_ZERO_R; |
3433 | break; |
3434 | case NVPTXISD::Suld3DI16Zero: |
3435 | Opc = NVPTX::SULD_3D_I16_ZERO_R; |
3436 | break; |
3437 | case NVPTXISD::Suld3DI32Zero: |
3438 | Opc = NVPTX::SULD_3D_I32_ZERO_R; |
3439 | break; |
3440 | case NVPTXISD::Suld3DI64Zero: |
3441 | Opc = NVPTX::SULD_3D_I64_ZERO_R; |
3442 | break; |
3443 | case NVPTXISD::Suld3DV2I8Zero: |
3444 | Opc = NVPTX::SULD_3D_V2I8_ZERO_R; |
3445 | break; |
3446 | case NVPTXISD::Suld3DV2I16Zero: |
3447 | Opc = NVPTX::SULD_3D_V2I16_ZERO_R; |
3448 | break; |
3449 | case NVPTXISD::Suld3DV2I32Zero: |
3450 | Opc = NVPTX::SULD_3D_V2I32_ZERO_R; |
3451 | break; |
3452 | case NVPTXISD::Suld3DV2I64Zero: |
3453 | Opc = NVPTX::SULD_3D_V2I64_ZERO_R; |
3454 | break; |
3455 | case NVPTXISD::Suld3DV4I8Zero: |
3456 | Opc = NVPTX::SULD_3D_V4I8_ZERO_R; |
3457 | break; |
3458 | case NVPTXISD::Suld3DV4I16Zero: |
3459 | Opc = NVPTX::SULD_3D_V4I16_ZERO_R; |
3460 | break; |
3461 | case NVPTXISD::Suld3DV4I32Zero: |
3462 | Opc = NVPTX::SULD_3D_V4I32_ZERO_R; |
3463 | break; |
3464 | } |
3465 | |
3466 | // Copy over operands |
3467 | SmallVector<SDValue, 8> Ops(drop_begin(RangeOrContainer: N->ops())); |
3468 | Ops.push_back(Elt: N->getOperand(Num: 0)); // Move chain to the back. |
3469 | |
3470 | ReplaceNode(F: N, T: CurDAG->getMachineNode(Opcode: Opc, dl: SDLoc(N), VTs: N->getVTList(), Ops)); |
3471 | return true; |
3472 | } |
3473 | |
3474 | |
3475 | /// SelectBFE - Look for instruction sequences that can be made more efficient |
3476 | /// by using the 'bfe' (bit-field extract) PTX instruction |
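///
/// A sketch of one pattern this matches, assuming a 32-bit value:
///   (and (srl %val, 4), 255)  -->  bfe.u32 %val, 4, 8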
3477 | bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) { |
3478 | SDLoc DL(N); |
3479 | SDValue LHS = N->getOperand(Num: 0); |
3480 | SDValue RHS = N->getOperand(Num: 1); |
3481 | SDValue Len; |
3482 | SDValue Start; |
3483 | SDValue Val; |
3484 | bool IsSigned = false; |
3485 | |
3486 | if (N->getOpcode() == ISD::AND) { |
3487 | // Canonicalize the operands |
3488 | // We want 'and %val, %mask' |
3489 | if (isa<ConstantSDNode>(Val: LHS) && !isa<ConstantSDNode>(Val: RHS)) { |
3490 | std::swap(a&: LHS, b&: RHS); |
3491 | } |
3492 | |
3493 | ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Val&: RHS); |
3494 | if (!Mask) { |
3495 | // We need a constant mask on the RHS of the AND |
3496 | return false; |
3497 | } |
3498 | |
3499 | // Extract the mask bits |
3500 | uint64_t MaskVal = Mask->getZExtValue(); |
3501 | if (!isMask_64(Value: MaskVal)) { |
3502 | // We *could* handle shifted masks here, but doing so would require an |
3503 | // 'and' operation to fix up the low-order bits so we would trade |
3504 | // shr+and for bfe+and, which has the same throughput |
3505 | return false; |
3506 | } |
3507 | |
3508 | // How many bits are in our mask? |
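    // (e.g. MaskVal == 0xff has 8 trailing ones, so NumBits will be 8.)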
3509 | int64_t NumBits = countr_one(Value: MaskVal); |
3510 | Len = CurDAG->getTargetConstant(Val: NumBits, DL, VT: MVT::i32); |
3511 | |
3512 | if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) { |
3513 | // We have a 'srl/and' pair, extract the effective start bit and length |
3514 | Val = LHS.getNode()->getOperand(Num: 0); |
3515 | Start = LHS.getNode()->getOperand(Num: 1); |
3516 | ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Val&: Start); |
3517 | if (StartConst) { |
3518 | uint64_t StartVal = StartConst->getZExtValue(); |
3519 | // How many "good" bits do we have left? "good" is defined here as bits |
3520 | // that exist in the original value, not shifted in. |
3521 | int64_t GoodBits = Start.getValueSizeInBits() - StartVal; |
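        // e.g. for a 32-bit value with StartVal == 24, GoodBits == 8, so a
        // mask wider than 8 bits is rejected below.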
3522 | if (NumBits > GoodBits) { |
3523 | // Do not handle the case where bits have been shifted in. In theory |
3524 | // we could handle this, but the cost is likely higher than just |
3525 | // emitting the srl/and pair. |
3526 | return false; |
3527 | } |
3528 | Start = CurDAG->getTargetConstant(Val: StartVal, DL, VT: MVT::i32); |
3529 | } else { |
        // Do not handle the case where the shift amount (which can be zero if
        // no srl was found) is not a constant. We could handle this case, but
        // it would require run-time logic that would be more expensive than
        // just emitting the srl/and pair.
3534 | return false; |
3535 | } |
3536 | } else { |
3537 | // Do not handle the case where the LHS of the and is not a shift. While |
3538 | // it would be trivial to handle this case, it would just transform |
      // 'and' -> 'bfe', but 'and' has higher throughput.
3540 | return false; |
3541 | } |
3542 | } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) { |
3543 | if (LHS->getOpcode() == ISD::AND) { |
3544 | ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(Val&: RHS); |
3545 | if (!ShiftCnst) { |
3546 | // Shift amount must be constant |
3547 | return false; |
3548 | } |
3549 | |
3550 | uint64_t ShiftAmt = ShiftCnst->getZExtValue(); |
3551 | |
3552 | SDValue AndLHS = LHS->getOperand(Num: 0); |
3553 | SDValue AndRHS = LHS->getOperand(Num: 1); |
3554 | |
3555 | // Canonicalize the AND to have the mask on the RHS |
3556 | if (isa<ConstantSDNode>(Val: AndLHS)) { |
3557 | std::swap(a&: AndLHS, b&: AndRHS); |
3558 | } |
3559 | |
3560 | ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Val&: AndRHS); |
3561 | if (!MaskCnst) { |
3562 | // Mask must be constant |
3563 | return false; |
3564 | } |
3565 | |
3566 | uint64_t MaskVal = MaskCnst->getZExtValue(); |
3567 | uint64_t NumZeros; |
3568 | uint64_t NumBits; |
3569 | if (isMask_64(Value: MaskVal)) { |
3570 | NumZeros = 0; |
3571 | // The number of bits in the result bitfield will be the number of |
3572 | // trailing ones (the AND) minus the number of bits we shift off |
3573 | NumBits = llvm::countr_one(Value: MaskVal) - ShiftAmt; |
3574 | } else if (isShiftedMask_64(Value: MaskVal)) { |
3575 | NumZeros = llvm::countr_zero(Val: MaskVal); |
3576 | unsigned NumOnes = llvm::countr_one(Value: MaskVal >> NumZeros); |
3577 | // The number of bits in the result bitfield will be the number of |
3578 | // trailing zeros plus the number of set bits in the mask minus the |
3579 | // number of bits we shift off |
3580 | NumBits = NumZeros + NumOnes - ShiftAmt; |
3581 | } else { |
3582 | // This is not a mask we can handle |
3583 | return false; |
3584 | } |
3585 | |
3586 | if (ShiftAmt < NumZeros) { |
3587 | // Handling this case would require extra logic that would make this |
3588 | // transformation non-profitable |
3589 | return false; |
3590 | } |
3591 | |
3592 | Val = AndLHS; |
3593 | Start = CurDAG->getTargetConstant(Val: ShiftAmt, DL, VT: MVT::i32); |
3594 | Len = CurDAG->getTargetConstant(Val: NumBits, DL, VT: MVT::i32); |
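      // A sketch of the net effect, assuming 32-bit operands:
      //   (srl (and %val, 0xff0), 4)  -->  bfe.u32 %val, 4, 8
      // since NumZeros == 4 and NumOnes == 8 give NumBits == 4 + 8 - 4 == 8.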
3595 | } else if (LHS->getOpcode() == ISD::SHL) { |
3596 | // Here, we have a pattern like: |
3597 | // |
3598 | // (sra (shl val, NN), MM) |
3599 | // or |
3600 | // (srl (shl val, NN), MM) |
3601 | // |
3602 | // If MM >= NN, we can efficiently optimize this with bfe |
3603 | Val = LHS->getOperand(Num: 0); |
3604 | |
3605 | SDValue ShlRHS = LHS->getOperand(Num: 1); |
3606 | ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(Val&: ShlRHS); |
3607 | if (!ShlCnst) { |
3608 | // Shift amount must be constant |
3609 | return false; |
3610 | } |
3611 | uint64_t InnerShiftAmt = ShlCnst->getZExtValue(); |
3612 | |
3613 | SDValue ShrRHS = RHS; |
3614 | ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(Val&: ShrRHS); |
3615 | if (!ShrCnst) { |
3616 | // Shift amount must be constant |
3617 | return false; |
3618 | } |
3619 | uint64_t OuterShiftAmt = ShrCnst->getZExtValue(); |
3620 | |
3621 | // To avoid extra codegen and be profitable, we need Outer >= Inner |
3622 | if (OuterShiftAmt < InnerShiftAmt) { |
3623 | return false; |
3624 | } |
3625 | |
      // If the outer shift is more than the type size, we have no bitfield to
      // extract (and since we also check that the inner shift is <= the outer
      // shift, this also implies that the inner shift is < the type size).
3629 | if (OuterShiftAmt >= Val.getValueSizeInBits()) { |
3630 | return false; |
3631 | } |
3632 | |
3633 | Start = CurDAG->getTargetConstant(Val: OuterShiftAmt - InnerShiftAmt, DL, |
3634 | VT: MVT::i32); |
3635 | Len = CurDAG->getTargetConstant(Val: Val.getValueSizeInBits() - OuterShiftAmt, |
3636 | DL, VT: MVT::i32); |
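      // A sketch of the net effect, assuming a 32-bit value:
      //   (srl (shl %val, 8), 16)  -->  bfe.u32 %val, 8, 16
      // i.e. Start == 16 - 8 == 8 and Len == 32 - 16 == 16.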
3637 | |
3638 | if (N->getOpcode() == ISD::SRA) { |
        // If we have an arithmetic right shift, we need to use the signed bfe
        // variant.
3641 | IsSigned = true; |
3642 | } |
3643 | } else { |
3644 | // No can do... |
3645 | return false; |
3646 | } |
3647 | } else { |
3648 | // No can do... |
3649 | return false; |
3650 | } |
3651 | |
3652 | |
3653 | unsigned Opc; |
3654 | // For the BFE operations we form here from "and" and "srl", always use the |
3655 | // unsigned variants. |
3656 | if (Val.getValueType() == MVT::i32) { |
3657 | if (IsSigned) { |
3658 | Opc = NVPTX::BFE_S32rii; |
3659 | } else { |
3660 | Opc = NVPTX::BFE_U32rii; |
3661 | } |
3662 | } else if (Val.getValueType() == MVT::i64) { |
3663 | if (IsSigned) { |
3664 | Opc = NVPTX::BFE_S64rii; |
3665 | } else { |
3666 | Opc = NVPTX::BFE_U64rii; |
3667 | } |
3668 | } else { |
3669 | // We cannot handle this type |
3670 | return false; |
3671 | } |
3672 | |
3673 | SDValue Ops[] = { |
3674 | Val, Start, Len |
3675 | }; |
3676 | |
3677 | ReplaceNode(F: N, T: CurDAG->getMachineNode(Opcode: Opc, dl: DL, VTs: N->getVTList(), Ops)); |
3678 | return true; |
3679 | } |
3680 | |
3681 | // SelectDirectAddr - Match a direct address for DAG. |
3682 | // A direct address could be a globaladdress or externalsymbol. |
3683 | bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) { |
3684 | // Return true if TGA or ES. |
3685 | if (N.getOpcode() == ISD::TargetGlobalAddress || |
3686 | N.getOpcode() == ISD::TargetExternalSymbol) { |
3687 | Address = N; |
3688 | return true; |
3689 | } |
3690 | if (N.getOpcode() == NVPTXISD::Wrapper) { |
3691 | Address = N.getOperand(i: 0); |
3692 | return true; |
3693 | } |
3694 | // addrspacecast(MoveParam(arg_symbol) to addrspace(PARAM)) -> arg_symbol |
3695 | if (AddrSpaceCastSDNode *CastN = dyn_cast<AddrSpaceCastSDNode>(Val&: N)) { |
3696 | if (CastN->getSrcAddressSpace() == ADDRESS_SPACE_GENERIC && |
3697 | CastN->getDestAddressSpace() == ADDRESS_SPACE_PARAM && |
3698 | CastN->getOperand(Num: 0).getOpcode() == NVPTXISD::MoveParam) |
3699 | return SelectDirectAddr(N: CastN->getOperand(Num: 0).getOperand(i: 0), Address); |
3700 | } |
3701 | return false; |
3702 | } |
3703 | |
3704 | // symbol+offset |
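// e.g. (add (Wrapper tglobaladdr:@g), 16) is selected as [g+16]; the offset
// must be an immediate constant.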
3705 | bool NVPTXDAGToDAGISel::SelectADDRsi_imp( |
3706 | SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) { |
3707 | if (Addr.getOpcode() == ISD::ADD) { |
3708 | if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val: Addr.getOperand(i: 1))) { |
3709 | SDValue base = Addr.getOperand(i: 0); |
3710 | if (SelectDirectAddr(N: base, Address&: Base)) { |
3711 | Offset = CurDAG->getTargetConstant(Val: CN->getZExtValue(), DL: SDLoc(OpNode), |
3712 | VT: mvt); |
3713 | return true; |
3714 | } |
3715 | } |
3716 | } |
3717 | return false; |
3718 | } |
3719 | |
3720 | // symbol+offset |
3721 | bool NVPTXDAGToDAGISel::SelectADDRsi(SDNode *OpNode, SDValue Addr, |
3722 | SDValue &Base, SDValue &Offset) { |
3723 | return SelectADDRsi_imp(OpNode, Addr, Base, Offset, mvt: MVT::i32); |
3724 | } |
3725 | |
3726 | // symbol+offset |
3727 | bool NVPTXDAGToDAGISel::SelectADDRsi64(SDNode *OpNode, SDValue Addr, |
3728 | SDValue &Base, SDValue &Offset) { |
3729 | return SelectADDRsi_imp(OpNode, Addr, Base, Offset, mvt: MVT::i64); |
3730 | } |
3731 | |
3732 | // register+offset |
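// e.g. (add %r, 16) is selected as [%r+16]; a frame index may also serve as
// the base, and the offset must fit in a signed 32-bit immediate.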
3733 | bool NVPTXDAGToDAGISel::SelectADDRri_imp( |
3734 | SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) { |
3735 | if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Val&: Addr)) { |
3736 | Base = CurDAG->getTargetFrameIndex(FI: FIN->getIndex(), VT: mvt); |
3737 | Offset = CurDAG->getTargetConstant(Val: 0, DL: SDLoc(OpNode), VT: mvt); |
3738 | return true; |
3739 | } |
3740 | if (Addr.getOpcode() == ISD::TargetExternalSymbol || |
3741 | Addr.getOpcode() == ISD::TargetGlobalAddress) |
3742 | return false; // direct calls. |
3743 | |
3744 | if (Addr.getOpcode() == ISD::ADD) { |
3745 | if (SelectDirectAddr(N: Addr.getOperand(i: 0), Address&: Addr)) { |
3746 | return false; |
3747 | } |
3748 | if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val: Addr.getOperand(i: 1))) { |
3749 | if (FrameIndexSDNode *FIN = |
3750 | dyn_cast<FrameIndexSDNode>(Val: Addr.getOperand(i: 0))) |
3751 | // Constant offset from frame ref. |
3752 | Base = CurDAG->getTargetFrameIndex(FI: FIN->getIndex(), VT: mvt); |
3753 | else |
3754 | Base = Addr.getOperand(i: 0); |
3755 | |
3756 | // Offset must fit in a 32-bit signed int in PTX [register+offset] address |
3757 | // mode |
3758 | if (!CN->getAPIntValue().isSignedIntN(N: 32)) |
3759 | return false; |
3760 | |
3761 | Offset = CurDAG->getTargetConstant(Val: CN->getSExtValue(), DL: SDLoc(OpNode), |
3762 | VT: MVT::i32); |
3763 | return true; |
3764 | } |
3765 | } |
3766 | return false; |
3767 | } |
3768 | |
3769 | // register+offset |
3770 | bool NVPTXDAGToDAGISel::SelectADDRri(SDNode *OpNode, SDValue Addr, |
3771 | SDValue &Base, SDValue &Offset) { |
3772 | return SelectADDRri_imp(OpNode, Addr, Base, Offset, mvt: MVT::i32); |
3773 | } |
3774 | |
3775 | // register+offset |
3776 | bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr, |
3777 | SDValue &Base, SDValue &Offset) { |
3778 | return SelectADDRri_imp(OpNode, Addr, Base, Offset, mvt: MVT::i64); |
3779 | } |
3780 | |
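/// ChkMemSDNodeAddressSpace - Return true if the memory operand of \p N is
/// known to be in address space \p spN. A pseudo source value (e.g. a stack
/// slot) is treated as matching only the generic address space (0).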
3781 | bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N, |
3782 | unsigned int spN) const { |
3783 | const Value *Src = nullptr; |
3784 | if (MemSDNode *mN = dyn_cast<MemSDNode>(Val: N)) { |
3785 | if (spN == 0 && mN->getMemOperand()->getPseudoValue()) |
3786 | return true; |
3787 | Src = mN->getMemOperand()->getValue(); |
3788 | } |
3789 | if (!Src) |
3790 | return false; |
3791 | if (auto *PT = dyn_cast<PointerType>(Val: Src->getType())) |
3792 | return (PT->getAddressSpace() == spN); |
3793 | return false; |
3794 | } |
3795 | |
3796 | /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for |
3797 | /// inline asm expressions. |
3798 | bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand( |
3799 | const SDValue &Op, InlineAsm::ConstraintCode ConstraintID, |
3800 | std::vector<SDValue> &OutOps) { |
3801 | SDValue Op0, Op1; |
3802 | switch (ConstraintID) { |
3803 | default: |
3804 | return true; |
3805 | case InlineAsm::ConstraintCode::m: // memory |
3806 | if (SelectDirectAddr(N: Op, Address&: Op0)) { |
3807 | OutOps.push_back(x: Op0); |
3808 | OutOps.push_back(x: CurDAG->getTargetConstant(Val: 0, DL: SDLoc(Op), VT: MVT::i32)); |
3809 | return false; |
3810 | } |
3811 | if (SelectADDRri(OpNode: Op.getNode(), Addr: Op, Base&: Op0, Offset&: Op1)) { |
3812 | OutOps.push_back(x: Op0); |
3813 | OutOps.push_back(x: Op1); |
3814 | return false; |
3815 | } |
3816 | break; |
3817 | } |
3818 | return true; |
3819 | } |
3820 | |
3821 | void NVPTXDAGToDAGISel::SelectV2I64toI128(SDNode *N) { |
3822 | // Lower a CopyToReg with two 64-bit inputs |
3823 | // Dst:i128, lo:i64, hi:i64 |
3824 | // |
3825 | // CopyToReg Dst, lo, hi; |
3826 | // |
3827 | // ==> |
3828 | // |
3829 | // tmp = V2I64toI128 {lo, hi}; |
3830 | // CopyToReg Dst, tmp; |
3831 | SDValue Dst = N->getOperand(Num: 1); |
3832 | SDValue Lo = N->getOperand(Num: 2); |
3833 | SDValue Hi = N->getOperand(Num: 3); |
3834 | |
3835 | SDLoc DL(N); |
3836 | SDNode *Mov = |
3837 | CurDAG->getMachineNode(Opcode: NVPTX::V2I64toI128, dl: DL, VT: MVT::i128, Ops: {Lo, Hi}); |
3838 | |
3839 | SmallVector<SDValue, 4> NewOps(N->getNumOperands() - 1); |
3840 | NewOps[0] = N->getOperand(Num: 0); |
3841 | NewOps[1] = Dst; |
3842 | NewOps[2] = SDValue(Mov, 0); |
3843 | if (N->getNumOperands() == 5) |
3844 | NewOps[3] = N->getOperand(Num: 4); |
  SDValue NewValue = CurDAG->getNode(Opcode: ISD::CopyToReg, DL,
                                     ResultTys: SmallVector<EVT>(N->values()),
                                     Ops: NewOps);
3846 | |
3847 | ReplaceNode(F: N, T: NewValue.getNode()); |
3848 | } |
3849 | |
3850 | void NVPTXDAGToDAGISel::SelectI128toV2I64(SDNode *N) { |
  // Lower CopyFromReg from a 128-bit reg to two 64-bit regs
3852 | // Dst:i128, Src:i128 |
3853 | // |
3854 | // {lo, hi} = CopyFromReg Src |
3855 | // |
3856 | // ==> |
3857 | // |
3858 | // {lo, hi} = I128toV2I64 Src |
3859 | // |
3860 | SDValue Ch = N->getOperand(Num: 0); |
3861 | SDValue Src = N->getOperand(Num: 1); |
3862 | SDValue Glue = N->getOperand(Num: 2); |
3863 | SDLoc DL(N); |
3864 | |
  // Add Glue and Ch to the operands and results to avoid breaking the
  // execution order.
3867 | SDNode *Mov = CurDAG->getMachineNode( |
3868 | Opcode: NVPTX::I128toV2I64, dl: DL, |
3869 | ResultTys: {MVT::i64, MVT::i64, Ch.getValueType(), Glue.getValueType()}, |
3870 | Ops: {Src, Ch, Glue}); |
3871 | |
3872 | ReplaceNode(F: N, T: Mov); |
3873 | } |
3874 | |
3875 | /// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a |
3876 | /// conversion from \p SrcTy to \p DestTy. |
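///
/// For example, a sign-extending i8-to-i32 load (\p LdNode with SEXTLOAD)
/// selects CVT_s32_s8; otherwise the unsigned variant CVT_u32_u8 is used.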
3877 | unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy, |
3878 | LoadSDNode *LdNode) { |
3879 | bool IsSigned = LdNode && LdNode->getExtensionType() == ISD::SEXTLOAD; |
3880 | switch (SrcTy.SimpleTy) { |
3881 | default: |
3882 | llvm_unreachable("Unhandled source type" ); |
3883 | case MVT::i8: |
3884 | switch (DestTy.SimpleTy) { |
3885 | default: |
3886 | llvm_unreachable("Unhandled dest type" ); |
3887 | case MVT::i16: |
3888 | return IsSigned ? NVPTX::CVT_s16_s8 : NVPTX::CVT_u16_u8; |
3889 | case MVT::i32: |
3890 | return IsSigned ? NVPTX::CVT_s32_s8 : NVPTX::CVT_u32_u8; |
3891 | case MVT::i64: |
3892 | return IsSigned ? NVPTX::CVT_s64_s8 : NVPTX::CVT_u64_u8; |
3893 | } |
3894 | case MVT::i16: |
3895 | switch (DestTy.SimpleTy) { |
3896 | default: |
3897 | llvm_unreachable("Unhandled dest type" ); |
3898 | case MVT::i8: |
3899 | return IsSigned ? NVPTX::CVT_s8_s16 : NVPTX::CVT_u8_u16; |
3900 | case MVT::i32: |
3901 | return IsSigned ? NVPTX::CVT_s32_s16 : NVPTX::CVT_u32_u16; |
3902 | case MVT::i64: |
3903 | return IsSigned ? NVPTX::CVT_s64_s16 : NVPTX::CVT_u64_u16; |
3904 | } |
3905 | case MVT::i32: |
3906 | switch (DestTy.SimpleTy) { |
3907 | default: |
3908 | llvm_unreachable("Unhandled dest type" ); |
3909 | case MVT::i8: |
3910 | return IsSigned ? NVPTX::CVT_s8_s32 : NVPTX::CVT_u8_u32; |
3911 | case MVT::i16: |
3912 | return IsSigned ? NVPTX::CVT_s16_s32 : NVPTX::CVT_u16_u32; |
3913 | case MVT::i64: |
3914 | return IsSigned ? NVPTX::CVT_s64_s32 : NVPTX::CVT_u64_u32; |
3915 | } |
3916 | case MVT::i64: |
3917 | switch (DestTy.SimpleTy) { |
3918 | default: |
3919 | llvm_unreachable("Unhandled dest type" ); |
3920 | case MVT::i8: |
3921 | return IsSigned ? NVPTX::CVT_s8_s64 : NVPTX::CVT_u8_u64; |
3922 | case MVT::i16: |
3923 | return IsSigned ? NVPTX::CVT_s16_s64 : NVPTX::CVT_u16_u64; |
3924 | case MVT::i32: |
3925 | return IsSigned ? NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64; |
3926 | } |
3927 | case MVT::f16: |
3928 | switch (DestTy.SimpleTy) { |
3929 | default: |
3930 | llvm_unreachable("Unhandled dest type" ); |
3931 | case MVT::f32: |
3932 | return NVPTX::CVT_f32_f16; |
3933 | case MVT::f64: |
3934 | return NVPTX::CVT_f64_f16; |
3935 | } |
3936 | } |
3937 | } |
3938 | |