1 | //===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file defines the interfaces that NVPTX uses to lower LLVM code into a |
10 | // selection DAG. |
11 | // |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #include "NVPTXISelLowering.h" |
15 | #include "MCTargetDesc/NVPTXBaseInfo.h" |
16 | #include "NVPTX.h" |
17 | #include "NVPTXSubtarget.h" |
18 | #include "NVPTXTargetMachine.h" |
19 | #include "NVPTXTargetObjectFile.h" |
20 | #include "NVPTXUtilities.h" |
21 | #include "llvm/ADT/APFloat.h" |
22 | #include "llvm/ADT/APInt.h" |
23 | #include "llvm/ADT/STLExtras.h" |
24 | #include "llvm/ADT/SmallVector.h" |
25 | #include "llvm/ADT/StringRef.h" |
26 | #include "llvm/CodeGen/Analysis.h" |
27 | #include "llvm/CodeGen/ISDOpcodes.h" |
28 | #include "llvm/CodeGen/MachineFunction.h" |
29 | #include "llvm/CodeGen/MachineJumpTableInfo.h" |
30 | #include "llvm/CodeGen/MachineMemOperand.h" |
31 | #include "llvm/CodeGen/Register.h" |
32 | #include "llvm/CodeGen/SelectionDAG.h" |
33 | #include "llvm/CodeGen/SelectionDAGNodes.h" |
34 | #include "llvm/CodeGen/TargetCallingConv.h" |
35 | #include "llvm/CodeGen/TargetLowering.h" |
36 | #include "llvm/CodeGen/ValueTypes.h" |
37 | #include "llvm/CodeGenTypes/MachineValueType.h" |
38 | #include "llvm/IR/Argument.h" |
39 | #include "llvm/IR/Attributes.h" |
40 | #include "llvm/IR/Constants.h" |
41 | #include "llvm/IR/DataLayout.h" |
42 | #include "llvm/IR/DerivedTypes.h" |
43 | #include "llvm/IR/DiagnosticInfo.h" |
44 | #include "llvm/IR/FPEnv.h" |
45 | #include "llvm/IR/Function.h" |
46 | #include "llvm/IR/GlobalValue.h" |
47 | #include "llvm/IR/IRBuilder.h" |
48 | #include "llvm/IR/Instruction.h" |
49 | #include "llvm/IR/Instructions.h" |
50 | #include "llvm/IR/IntrinsicsNVPTX.h" |
51 | #include "llvm/IR/Module.h" |
52 | #include "llvm/IR/Type.h" |
53 | #include "llvm/IR/Value.h" |
54 | #include "llvm/Support/Alignment.h" |
55 | #include "llvm/Support/AtomicOrdering.h" |
56 | #include "llvm/Support/Casting.h" |
57 | #include "llvm/Support/CodeGen.h" |
58 | #include "llvm/Support/CommandLine.h" |
59 | #include "llvm/Support/ErrorHandling.h" |
60 | #include "llvm/Support/NVPTXAddrSpace.h" |
61 | #include "llvm/Support/raw_ostream.h" |
62 | #include "llvm/Target/TargetMachine.h" |
63 | #include "llvm/Target/TargetOptions.h" |
64 | #include <algorithm> |
65 | #include <cassert> |
66 | #include <cmath> |
67 | #include <cstdint> |
68 | #include <iterator> |
69 | #include <optional> |
70 | #include <string> |
71 | #include <tuple> |
72 | #include <utility> |
73 | #include <vector> |
74 | |
75 | #define DEBUG_TYPE "nvptx-lower" |
76 | |
77 | using namespace llvm; |
78 | |
79 | static cl::opt<bool> sched4reg( |
80 | "nvptx-sched4reg",
81 | cl::desc("NVPTX Specific: schedule for register pressure"), cl::init(Val: false));
82 | |
83 | static cl::opt<unsigned> FMAContractLevelOpt( |
84 | "nvptx-fma-level" , cl::Hidden, |
85 | cl::desc("NVPTX Specific: FMA contraction (0: don't do it,"
86 | " 1: do it, 2: do it aggressively)"),
87 | cl::init(Val: 2)); |
88 | |
89 | static cl::opt<NVPTX::DivPrecisionLevel> UsePrecDivF32( |
90 | "nvptx-prec-divf32" , cl::Hidden, |
91 | cl::desc( |
92 | "NVPTX Specific: Override the precision of the lowering for f32 fdiv" ), |
93 | cl::values( |
94 | clEnumValN(NVPTX::DivPrecisionLevel::Approx, "0" , "Use div.approx" ), |
95 | clEnumValN(NVPTX::DivPrecisionLevel::Full, "1" , "Use div.full" ), |
96 | clEnumValN(NVPTX::DivPrecisionLevel::IEEE754, "2" , |
97 | "Use IEEE Compliant F32 div.rnd if available (default)" ), |
98 | clEnumValN(NVPTX::DivPrecisionLevel::IEEE754_NoFTZ, "3" , |
99 | "Use IEEE Compliant F32 div.rnd if available, no FTZ" )), |
100 | cl::init(Val: NVPTX::DivPrecisionLevel::IEEE754)); |
101 | |
102 | static cl::opt<bool> UsePrecSqrtF32( |
103 | "nvptx-prec-sqrtf32" , cl::Hidden, |
104 | cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn." ), |
105 | cl::init(Val: true)); |
106 | |
107 | /// Whereas CUDA's implementation (see libdevice) uses ex2.approx for exp2(), it |
108 | /// does NOT use lg2.approx for log2, so this is disabled by default. |
109 | static cl::opt<bool> UseApproxLog2F32( |
110 | "nvptx-approx-log2f32" , |
111 | cl::desc("NVPTX Specific: whether to use lg2.approx for log2" ), |
112 | cl::init(Val: false)); |
113 | |
114 | static cl::opt<bool> ForceMinByValParamAlign( |
115 | "nvptx-force-min-byval-param-align" , cl::Hidden, |
116 | cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval" |
117 | " params of device functions." ), |
118 | cl::init(Val: false)); |
119 | |
120 | NVPTX::DivPrecisionLevel |
121 | NVPTXTargetLowering::getDivF32Level(const MachineFunction &MF, |
122 | const SDNode &N) const { |
123 | // If nvptx-prec-divf32=N is used on the command-line, always honor it
124 | if (UsePrecDivF32.getNumOccurrences() > 0) |
125 | return UsePrecDivF32; |
126 | |
127 | // Otherwise, use div.approx if fast math is enabled |
128 | if (allowUnsafeFPMath(MF)) |
129 | return NVPTX::DivPrecisionLevel::Approx; |
130 | |
131 | const SDNodeFlags Flags = N.getFlags(); |
132 | if (Flags.hasApproximateFuncs()) |
133 | return NVPTX::DivPrecisionLevel::Approx; |
134 | |
135 | return NVPTX::DivPrecisionLevel::IEEE754; |
136 | } |
137 | |
138 | bool NVPTXTargetLowering::usePrecSqrtF32(const MachineFunction &MF, |
139 | const SDNode *N) const { |
140 | // If nvptx-prec-sqrtf32 is used on the command-line, always honor it |
141 | if (UsePrecSqrtF32.getNumOccurrences() > 0) |
142 | return UsePrecSqrtF32; |
143 | |
144 | // Otherwise, use sqrt.approx if fast math is enabled |
145 | if (allowUnsafeFPMath(MF)) |
146 | return false; |
147 | |
148 | if (N) { |
149 | const SDNodeFlags Flags = N->getFlags(); |
150 | if (Flags.hasApproximateFuncs()) |
151 | return false; |
152 | } |
153 | |
154 | return true; |
155 | } |
156 | |
157 | bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const { |
158 | return MF.getDenormalMode(FPType: APFloat::IEEEsingle()).Output == |
159 | DenormalMode::PreserveSign; |
160 | } |
161 | |
162 | static bool IsPTXVectorType(MVT VT) { |
163 | switch (VT.SimpleTy) { |
164 | default: |
165 | return false; |
166 | case MVT::v2i1: |
167 | case MVT::v4i1: |
168 | case MVT::v2i8: |
169 | case MVT::v4i8: |
170 | case MVT::v8i8: // <2 x i8x4> |
171 | case MVT::v16i8: // <4 x i8x4> |
172 | case MVT::v2i16: |
173 | case MVT::v4i16: |
174 | case MVT::v8i16: // <4 x i16x2> |
175 | case MVT::v2i32: |
176 | case MVT::v4i32: |
177 | case MVT::v2i64: |
178 | case MVT::v2f16: |
179 | case MVT::v4f16: |
180 | case MVT::v8f16: // <4 x f16x2> |
181 | case MVT::v2bf16: |
182 | case MVT::v4bf16: |
183 | case MVT::v8bf16: // <4 x bf16x2> |
184 | case MVT::v2f32: |
185 | case MVT::v4f32: |
186 | case MVT::v2f64: |
187 | case MVT::v4i64: |
188 | case MVT::v4f64: |
189 | case MVT::v8i32: |
190 | case MVT::v8f32: |
191 | case MVT::v16f16: // <8 x f16x2> |
192 | case MVT::v16bf16: // <8 x bf16x2> |
193 | case MVT::v16i16: // <8 x i16x2> |
194 | case MVT::v32i8: // <8 x i8x4> |
195 | return true; |
196 | } |
197 | } |
198 | |
199 | static bool Is16bitsType(MVT VT) { |
200 | return (VT.SimpleTy == MVT::f16 || VT.SimpleTy == MVT::bf16 || |
201 | VT.SimpleTy == MVT::i16); |
202 | } |
203 | |
204 | // This function is called when legalizing vector loads/stores. It does two
205 | // things:
206 | // 1. Determines whether the vector is something we want to custom lower;
207 | //    std::nullopt is returned if we do not want to custom lower it.
208 | // 2. If we do want to handle it, returns two values:
209 | //    - unsigned NumElts - The number of elements in the final vector
210 | //    - MVT EltVT - The type of the elements in the final vector
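// For example, i128 is lowered as {2, i64}, v4f32 as {4, f32}, and v8f16 as
// {4, v2f16}; 256-bit vectors such as v8f32 are only handled when
// CanLowerTo256Bit is true.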
211 | static std::optional<std::pair<unsigned int, MVT>> |
212 | getVectorLoweringShape(EVT VectorEVT, bool CanLowerTo256Bit) { |
213 | if (!VectorEVT.isSimple()) |
214 | return std::nullopt; |
215 | const MVT VectorVT = VectorEVT.getSimpleVT(); |
216 | |
217 | if (!VectorVT.isVector()) { |
218 | if (VectorVT == MVT::i128 || VectorVT == MVT::f128) |
219 | return {{2, MVT::i64}}; |
220 | return std::nullopt; |
221 | } |
222 | |
223 | const MVT EltVT = VectorVT.getVectorElementType(); |
224 | const unsigned NumElts = VectorVT.getVectorNumElements(); |
225 | |
226 | // We only handle "native" vector sizes for now, e.g. <4 x double> is not |
227 | // legal. We can (and should) split that into 2 stores of <2 x double> here |
228 | // but I'm leaving that as a TODO for now. |
229 | switch (VectorVT.SimpleTy) { |
230 | default: |
231 | return std::nullopt; |
232 | case MVT::v4i64: |
233 | case MVT::v4f64: |
234 | case MVT::v8i32: |
235 | case MVT::v8f32: |
236 | // This is a "native" vector type iff the address space is global |
237 | // and the target supports 256-bit loads/stores |
238 | if (!CanLowerTo256Bit) |
239 | return std::nullopt; |
240 | LLVM_FALLTHROUGH; |
241 | case MVT::v2i8: |
242 | case MVT::v2i32: |
243 | case MVT::v2i64: |
244 | case MVT::v2f32: |
245 | case MVT::v2f64: |
246 | case MVT::v4i32: |
247 | case MVT::v4f32: |
248 | // This is a "native" vector type |
249 | return std::pair(NumElts, EltVT); |
250 | case MVT::v16f16: // <8 x f16x2> |
251 | case MVT::v16bf16: // <8 x bf16x2> |
252 | case MVT::v16i16: // <8 x i16x2> |
253 | case MVT::v32i8: // <8 x i8x4> |
254 | // This can be upsized into a "native" vector type iff the address space is |
255 | // global and the target supports 256-bit loads/stores. |
256 | if (!CanLowerTo256Bit) |
257 | return std::nullopt; |
258 | LLVM_FALLTHROUGH; |
259 | case MVT::v2i16: // <1 x i16x2> |
260 | case MVT::v2f16: // <1 x f16x2> |
261 | case MVT::v2bf16: // <1 x bf16x2> |
262 | case MVT::v4i8: // <1 x i8x4> |
263 | case MVT::v4i16: // <2 x i16x2> |
264 | case MVT::v4f16: // <2 x f16x2> |
265 | case MVT::v4bf16: // <2 x bf16x2> |
266 | case MVT::v8i8: // <2 x i8x4> |
267 | case MVT::v8f16: // <4 x f16x2> |
268 | case MVT::v8bf16: // <4 x bf16x2> |
269 | case MVT::v8i16: // <4 x i16x2> |
270 | case MVT::v16i8: // <4 x i8x4> |
271 | // This can be upsized into a "native" vector type. |
272 | // Despite vectors like v8i8, v16i8, v8i16 being within the bit-limit for |
273 | // total load/store size, PTX syntax only supports v2/v4. Thus, we can't use |
274 | // vectorized loads/stores with the actual element type for i8/i16 as that |
275 | // would require v8/v16 variants that do not exist. |
276 | // In order to load/store such vectors efficiently, here in Type |
277 | // Legalization, we split the vector into word-sized chunks (v2x16/v4i8). |
278 | // Later, we will lower to PTX as vectors of b32. |
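// For example, v16i8 becomes {4, v4i8} and v8i16 becomes {4, v2i16}.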
279 | |
280 | // Number of elements to pack in one word. |
281 | const unsigned NPerWord = 32 / EltVT.getSizeInBits(); |
282 | |
283 | return std::pair(NumElts / NPerWord, MVT::getVectorVT(VT: EltVT, NumElements: NPerWord)); |
284 | } |
285 | |
286 | llvm_unreachable("All cases in switch should return." ); |
287 | } |
288 | |
289 | /// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive |
290 | /// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors |
291 | /// into their primitive components. |
292 | /// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the |
293 | /// same number of types as the Ins/Outs arrays in LowerFormalArguments, |
294 | /// LowerCall, and LowerReturn. |
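/// For example, <4 x half> is reported as two v2f16 values at offsets 0 and 4,
/// and [2 x i32] as two i32 values at offsets 0 and 4.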
295 | static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, |
296 | Type *Ty, SmallVectorImpl<EVT> &ValueVTs, |
297 | SmallVectorImpl<uint64_t> *Offsets = nullptr, |
298 | uint64_t StartingOffset = 0) { |
299 | SmallVector<EVT, 16> TempVTs; |
300 | SmallVector<uint64_t, 16> TempOffsets; |
301 | |
302 | // Special case for i128 - decompose to (i64, i64) |
303 | if (Ty->isIntegerTy(Bitwidth: 128) || Ty->isFP128Ty()) { |
304 | ValueVTs.append(IL: {MVT::i64, MVT::i64}); |
305 | |
306 | if (Offsets) |
307 | Offsets->append(IL: {StartingOffset + 0, StartingOffset + 8}); |
308 | |
309 | return; |
310 | } |
311 | |
312 | // Given a struct type, recursively traverse the elements with custom ComputePTXValueVTs. |
313 | if (StructType *STy = dyn_cast<StructType>(Val: Ty)) { |
314 | auto const *SL = DL.getStructLayout(Ty: STy); |
315 | auto ElementNum = 0; |
316 | for (auto *EI : STy->elements()) {
317 | ComputePTXValueVTs(TLI, DL, Ty: EI, ValueVTs, Offsets, |
318 | StartingOffset: StartingOffset + SL->getElementOffset(Idx: ElementNum)); |
319 | ++ElementNum; |
320 | } |
321 | return; |
322 | } |
323 | |
324 | // Given an array type, recursively traverse the elements with custom ComputePTXValueVTs. |
325 | if (ArrayType *ATy = dyn_cast<ArrayType>(Val: Ty)) { |
326 | Type *EltTy = ATy->getElementType(); |
327 | uint64_t EltSize = DL.getTypeAllocSize(Ty: EltTy); |
328 | for (int I : llvm::seq<int>(Size: ATy->getNumElements())) |
329 | ComputePTXValueVTs(TLI, DL, Ty: EltTy, ValueVTs, Offsets, StartingOffset: StartingOffset + I * EltSize); |
330 | return; |
331 | } |
332 | |
333 | ComputeValueVTs(TLI, DL, Ty, ValueVTs&: TempVTs, FixedOffsets: &TempOffsets, StartingOffset); |
334 | for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) { |
335 | EVT VT = TempVTs[i]; |
336 | uint64_t Off = TempOffsets[i]; |
337 | // Split vectors into individual elements, except for packed types such as
338 | // v2f16/v2bf16/v2i16/v4i8, which we will pass as single scalars.
339 | if (VT.isVector()) { |
340 | unsigned NumElts = VT.getVectorNumElements(); |
341 | EVT EltVT = VT.getVectorElementType(); |
342 | // We require power-of-2 sized vectors because
343 | // TargetLoweringBase::getVectorTypeBreakdown(), which is invoked in
344 | // ComputePTXValueVTs(), cannot currently break down non-power-of-2 sized
345 | // vectors.
346 | if ((Is16bitsType(VT: EltVT.getSimpleVT())) && NumElts % 2 == 0 && |
347 | isPowerOf2_32(Value: NumElts)) { |
348 | // Vectors with an even number of f16 elements will be passed to |
349 | // us as an array of v2f16/v2bf16 elements. We must match this so we |
350 | // stay in sync with Ins/Outs. |
351 | switch (EltVT.getSimpleVT().SimpleTy) { |
352 | case MVT::f16: |
353 | EltVT = MVT::v2f16; |
354 | break; |
355 | case MVT::bf16: |
356 | EltVT = MVT::v2bf16; |
357 | break; |
358 | case MVT::i16: |
359 | EltVT = MVT::v2i16; |
360 | break; |
361 | default: |
362 | llvm_unreachable("Unexpected type" ); |
363 | } |
364 | NumElts /= 2; |
365 | } else if (EltVT.getSimpleVT() == MVT::i8 && |
366 | ((NumElts % 4 == 0 && isPowerOf2_32(Value: NumElts)) || |
367 | NumElts == 3)) { |
368 | // v*i8 are formally lowered as v4i8 |
369 | EltVT = MVT::v4i8; |
370 | NumElts = (NumElts + 3) / 4; |
371 | } else if (EltVT.getSimpleVT() == MVT::i8 && NumElts == 2) { |
372 | // v2i8 is promoted to v2i16 |
373 | NumElts = 1; |
374 | EltVT = MVT::v2i8; |
375 | } |
376 | for (unsigned j = 0; j != NumElts; ++j) { |
377 | ValueVTs.push_back(Elt: EltVT); |
378 | if (Offsets) |
379 | Offsets->push_back(Elt: Off + j * EltVT.getStoreSize()); |
380 | } |
381 | } else { |
382 | ValueVTs.push_back(Elt: VT); |
383 | if (Offsets) |
384 | Offsets->push_back(Elt: Off); |
385 | } |
386 | } |
387 | } |
388 | |
389 | /// promoteScalarIntegerPTX
390 | /// Used to make sure scalar integer arguments/returns are suitable for
391 | /// passing and promotes them to the next supported size if they're not.
392 | ///
393 | /// Returns the promoted EVT, or \p VT unchanged if no promotion is needed.
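/// For example, i3 is promoted to i8, i24 to i32, and i47 to i64; i8, i16,
/// i32, and i64 are returned unchanged.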
394 | static EVT promoteScalarIntegerPTX(const EVT VT) { |
395 | if (VT.isScalarInteger()) { |
396 | switch (PowerOf2Ceil(A: VT.getFixedSizeInBits())) { |
397 | default: |
398 | llvm_unreachable( |
399 | "Promotion is not suitable for scalars of size larger than 64-bits" ); |
400 | case 1: |
401 | return MVT::i1; |
402 | case 2: |
403 | case 4: |
404 | case 8: |
405 | return MVT::i8; |
406 | case 16: |
407 | return MVT::i16; |
408 | case 32: |
409 | return MVT::i32; |
410 | case 64: |
411 | return MVT::i64; |
412 | } |
413 | } |
414 | return VT; |
415 | } |
416 | |
417 | // Check whether we can merge loads/stores of some of the pieces of a |
418 | // flattened function parameter or return value into a single vector |
419 | // load/store. |
420 | // |
421 | // The flattened parameter is represented as a list of EVTs and |
422 | // offsets, and the whole structure is aligned to ParamAlignment. This |
423 | // function determines whether we can load/store pieces of the |
424 | // parameter starting at index Idx using a single vectorized op of |
425 | // size AccessSize. If so, it returns the number of param pieces |
426 | // covered by the vector op. Otherwise, it returns 1. |
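// For example, two contiguous i32 pieces at offsets 0 and 4 within an 8-byte
// aligned parameter can be covered by a single 8-byte access, so this returns
// 2 for Idx == 0 and AccessSize == 8.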
427 | static unsigned CanMergeParamLoadStoresStartingAt( |
428 | unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs, |
429 | const SmallVectorImpl<uint64_t> &Offsets, Align ParamAlignment) { |
430 | |
431 | // Can't vectorize if param alignment is not sufficient. |
432 | if (ParamAlignment < AccessSize) |
433 | return 1; |
434 | // Can't vectorize if offset is not aligned. |
435 | if (Offsets[Idx] & (AccessSize - 1)) |
436 | return 1; |
437 | |
438 | EVT EltVT = ValueVTs[Idx]; |
439 | unsigned EltSize = EltVT.getStoreSize(); |
440 | |
441 | // Element is too large to vectorize. |
442 | if (EltSize >= AccessSize) |
443 | return 1; |
444 | |
445 | unsigned NumElts = AccessSize / EltSize; |
446 | // Can't vectorize if AccessSize is not a multiple of EltSize.
447 | if (AccessSize != EltSize * NumElts) |
448 | return 1; |
449 | |
450 | // We don't have enough elements to vectorize. |
451 | if (Idx + NumElts > ValueVTs.size()) |
452 | return 1; |
453 | |
454 | // PTX ISA can only deal with 2- and 4-element vector ops. |
455 | if (NumElts != 4 && NumElts != 2) |
456 | return 1; |
457 | |
458 | for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) { |
459 | // Types do not match. |
460 | if (ValueVTs[j] != EltVT) |
461 | return 1; |
462 | |
463 | // Elements are not contiguous. |
464 | if (Offsets[j] - Offsets[j - 1] != EltSize) |
465 | return 1; |
466 | } |
467 | // OK. We can vectorize ValueVTs[Idx..Idx+NumElts).
468 | return NumElts; |
469 | } |
470 | |
471 | // Computes whether and how we can vectorize the loads/stores of a |
472 | // flattened function parameter or return value. |
473 | // |
474 | // The flattened parameter is represented as the list of ValueVTs and
475 | // Offsets, and is aligned to ParamAlignment bytes. We return a list of
476 | // access widths whose entries sum to ValueVTs.size(): each entry is the
477 | // number of consecutive pieces (1, 2, or 4) that should be loaded/stored
478 | // with a single (possibly vectorized) op.
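// For example, four f32 pieces at offsets 0, 4, 8, and 12 yield {4} when the
// parameter is 16-byte aligned, but {2, 2} when it is only 8-byte aligned.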
479 | static SmallVector<unsigned, 16> |
480 | VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs, |
481 | const SmallVectorImpl<uint64_t> &Offsets, |
482 | Align ParamAlignment, bool IsVAArg = false) { |
483 | // Variadic arguments are never vectorized; every piece is loaded/stored
484 | // as a scalar.
485 | |
486 | if (IsVAArg) |
487 | return SmallVector<unsigned>(ValueVTs.size(), 1); |
488 | |
489 | SmallVector<unsigned, 16> VectorInfo; |
490 | |
491 | const auto GetNumElts = [&](unsigned I) -> unsigned { |
492 | for (const unsigned AccessSize : {16, 8, 4, 2}) { |
493 | const unsigned NumElts = CanMergeParamLoadStoresStartingAt( |
494 | Idx: I, AccessSize, ValueVTs, Offsets, ParamAlignment); |
495 | assert((NumElts == 1 || NumElts == 2 || NumElts == 4) && |
496 | "Unexpected vectorization size" ); |
497 | if (NumElts != 1) |
498 | return NumElts; |
499 | } |
500 | return 1; |
501 | }; |
502 | |
503 | // Check what we can vectorize using 128/64/32/16-bit accesses.
504 | for (unsigned I = 0, E = ValueVTs.size(); I != E;) { |
505 | const unsigned NumElts = GetNumElts(I); |
506 | VectorInfo.push_back(Elt: NumElts); |
507 | I += NumElts; |
508 | } |
509 | assert(std::accumulate(VectorInfo.begin(), VectorInfo.end(), 0u) == |
510 | ValueVTs.size()); |
511 | return VectorInfo; |
512 | } |
513 | |
514 | // NVPTXTargetLowering Constructor. |
515 | NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, |
516 | const NVPTXSubtarget &STI) |
517 | : TargetLowering(TM), nvTM(&TM), STI(STI), GlobalUniqueCallSite(0) { |
518 | // Always lower memset, memcpy, and memmove intrinsics to load/store
519 | // instructions, rather than generating calls to memset, memcpy, or
520 | // memmove.
521 | MaxStoresPerMemset = MaxStoresPerMemsetOptSize = (unsigned)0xFFFFFFFF; |
522 | MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = (unsigned) 0xFFFFFFFF; |
523 | MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = (unsigned) 0xFFFFFFFF; |
524 | |
525 | setBooleanContents(ZeroOrNegativeOneBooleanContent); |
526 | setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); |
527 | |
528 | // Jump is Expensive. Don't create extra control flow for 'and', 'or' |
529 | // condition branches. |
530 | setJumpIsExpensive(true); |
531 | |
532 | // Wide divides are _very_ slow. Try to reduce the width of the divide if |
533 | // possible. |
534 | addBypassSlowDiv(SlowBitWidth: 64, FastBitWidth: 32); |
535 | |
536 | // By default, use the Source scheduling |
537 | if (sched4reg) |
538 | setSchedulingPreference(Sched::RegPressure); |
539 | else |
540 | setSchedulingPreference(Sched::Source); |
541 | |
542 | auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action, |
543 | LegalizeAction NoF16Action) { |
544 | bool IsOpSupported = STI.allowFP16Math(); |
545 | switch (Op) { |
546 | // Several FP16 instructions are available on sm_80 only. |
547 | case ISD::FMINNUM: |
548 | case ISD::FMAXNUM: |
549 | case ISD::FMAXNUM_IEEE: |
550 | case ISD::FMINNUM_IEEE: |
551 | case ISD::FMAXIMUM: |
552 | case ISD::FMINIMUM: |
553 | IsOpSupported &= STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70; |
554 | break; |
555 | case ISD::FEXP2: |
556 | IsOpSupported &= STI.getSmVersion() >= 75 && STI.getPTXVersion() >= 70; |
557 | break; |
558 | } |
559 | setOperationAction(Op, VT, Action: IsOpSupported ? Action : NoF16Action); |
560 | }; |
561 | |
562 | auto setBF16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action, |
563 | LegalizeAction NoBF16Action) { |
564 | bool IsOpSupported = STI.hasNativeBF16Support(Opcode: Op); |
565 | setOperationAction( |
566 | Op, VT, Action: IsOpSupported ? Action : NoBF16Action); |
567 | }; |
568 | |
569 | auto setI16x2OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action, |
570 | LegalizeAction NoI16x2Action) { |
571 | bool IsOpSupported = false; |
572 | // These instructions are available on sm_90 only
573 | switch (Op) { |
574 | case ISD::ADD: |
575 | case ISD::SMAX: |
576 | case ISD::SMIN: |
577 | case ISD::UMIN: |
578 | case ISD::UMAX: |
579 | IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 80; |
580 | break; |
581 | } |
582 | setOperationAction(Op, VT, Action: IsOpSupported ? Action : NoI16x2Action); |
583 | }; |
584 | |
585 | addRegisterClass(VT: MVT::i1, RC: &NVPTX::B1RegClass); |
586 | addRegisterClass(VT: MVT::i16, RC: &NVPTX::B16RegClass); |
587 | addRegisterClass(VT: MVT::v2i16, RC: &NVPTX::B32RegClass); |
588 | addRegisterClass(VT: MVT::v4i8, RC: &NVPTX::B32RegClass); |
589 | addRegisterClass(VT: MVT::i32, RC: &NVPTX::B32RegClass); |
590 | addRegisterClass(VT: MVT::i64, RC: &NVPTX::B64RegClass); |
591 | addRegisterClass(VT: MVT::f32, RC: &NVPTX::B32RegClass); |
592 | addRegisterClass(VT: MVT::f64, RC: &NVPTX::B64RegClass); |
593 | addRegisterClass(VT: MVT::f16, RC: &NVPTX::B16RegClass); |
594 | addRegisterClass(VT: MVT::v2f16, RC: &NVPTX::B32RegClass); |
595 | addRegisterClass(VT: MVT::bf16, RC: &NVPTX::B16RegClass); |
596 | addRegisterClass(VT: MVT::v2bf16, RC: &NVPTX::B32RegClass); |
597 | |
598 | // Conversion to/from FP16/FP16x2 is always legal. |
599 | setOperationAction(Op: ISD::BUILD_VECTOR, VT: MVT::v2f16, Action: Custom); |
600 | setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: MVT::v2f16, Action: Custom); |
601 | setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: MVT::v2f16, Action: Expand); |
602 | setOperationAction(Op: ISD::VECTOR_SHUFFLE, VT: MVT::v2f16, Action: Expand); |
603 | |
604 | setOperationAction(Op: ISD::READCYCLECOUNTER, VT: MVT::i64, Action: Legal); |
605 | if (STI.getSmVersion() >= 30 && STI.getPTXVersion() > 31) |
606 | setOperationAction(Op: ISD::READSTEADYCOUNTER, VT: MVT::i64, Action: Legal); |
607 | |
608 | setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote); |
609 | setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand); |
610 | |
611 | // Conversion to/from BF16/BF16x2 is always legal.
612 | setOperationAction(Op: ISD::BUILD_VECTOR, VT: MVT::v2bf16, Action: Custom); |
613 | setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: MVT::v2bf16, Action: Custom); |
614 | setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: MVT::v2bf16, Action: Expand); |
615 | setOperationAction(Op: ISD::VECTOR_SHUFFLE, VT: MVT::v2bf16, Action: Expand); |
616 | |
617 | setBF16OperationAction(ISD::SETCC, MVT::v2bf16, Legal, Expand); |
618 | setBF16OperationAction(ISD::SETCC, MVT::bf16, Legal, Promote); |
619 | if (getOperationAction(Op: ISD::SETCC, VT: MVT::bf16) == Promote) |
620 | AddPromotedToType(Opc: ISD::SETCC, OrigVT: MVT::bf16, DestVT: MVT::f32); |
621 | |
622 | // Conversion to/from i16/i16x2 is always legal. |
623 | setOperationAction(Op: ISD::BUILD_VECTOR, VT: MVT::v2i16, Action: Custom); |
624 | setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: MVT::v2i16, Action: Custom); |
625 | setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: MVT::v2i16, Action: Expand); |
626 | setOperationAction(Op: ISD::VECTOR_SHUFFLE, VT: MVT::v2i16, Action: Expand); |
627 | |
628 | setOperationAction(Op: ISD::BUILD_VECTOR, VT: MVT::v4i8, Action: Custom); |
629 | setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: MVT::v4i8, Action: Custom); |
630 | setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: MVT::v4i8, Action: Custom); |
631 | setOperationAction(Op: ISD::VECTOR_SHUFFLE, VT: MVT::v4i8, Action: Custom); |
632 | |
633 | // Custom conversions to/from v2i8. |
634 | setOperationAction(Op: ISD::BITCAST, VT: MVT::v2i8, Action: Custom); |
635 | |
636 | // Only logical ops can be done on v4i8 directly, others must be done |
637 | // elementwise. |
638 | setOperationAction( |
639 | Ops: {ISD::ABS, ISD::ADD, ISD::ADDC, ISD::ADDE, |
640 | ISD::BITREVERSE, ISD::CTLZ, ISD::CTPOP, ISD::CTTZ, |
641 | ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FSHL, ISD::FSHR, |
642 | ISD::MUL, ISD::MULHS, ISD::MULHU, ISD::PARITY, |
643 | ISD::ROTL, ISD::ROTR, ISD::SADDO, ISD::SADDO_CARRY, |
644 | ISD::SADDSAT, ISD::SDIV, ISD::SDIVREM, ISD::SELECT_CC, |
645 | ISD::SETCC, ISD::SHL, ISD::SINT_TO_FP, ISD::SMAX, |
646 | ISD::SMIN, ISD::SMULO, ISD::SMUL_LOHI, ISD::SRA, |
647 | ISD::SREM, ISD::SRL, ISD::SSHLSAT, ISD::SSUBO, |
648 | ISD::SSUBO_CARRY, ISD::SSUBSAT, ISD::SUB, ISD::SUBC, |
649 | ISD::SUBE, ISD::UADDO, ISD::UADDO_CARRY, ISD::UADDSAT, |
650 | ISD::UDIV, ISD::UDIVREM, ISD::UINT_TO_FP, ISD::UMAX, |
651 | ISD::UMIN, ISD::UMULO, ISD::UMUL_LOHI, ISD::UREM, |
652 | ISD::USHLSAT, ISD::USUBO, ISD::USUBO_CARRY, ISD::VSELECT, |
653 | ISD::USUBSAT}, |
654 | VT: MVT::v4i8, Action: Expand); |
655 | |
656 | // Operations not directly supported by NVPTX. |
657 | for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32, |
658 | MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16, MVT::v4i8, |
659 | MVT::i32, MVT::i64}) { |
660 | setOperationAction(Op: ISD::SELECT_CC, VT, Action: Expand); |
661 | setOperationAction(Op: ISD::BR_CC, VT, Action: Expand); |
662 | } |
663 | |
664 | // Some SIGN_EXTEND_INREG can be done using cvt instruction. |
665 | // For others we will expand to a SHL/SRA pair. |
666 | setOperationAction(Op: ISD::SIGN_EXTEND_INREG, VT: MVT::i64, Action: Legal); |
667 | setOperationAction(Op: ISD::SIGN_EXTEND_INREG, VT: MVT::i32, Action: Legal); |
668 | setOperationAction(Op: ISD::SIGN_EXTEND_INREG, VT: MVT::i16, Action: Legal); |
669 | setOperationAction(Op: ISD::SIGN_EXTEND_INREG, VT: MVT::i8 , Action: Legal); |
670 | setOperationAction(Op: ISD::SIGN_EXTEND_INREG, VT: MVT::i1, Action: Expand); |
671 | setOperationAction(Op: ISD::SIGN_EXTEND_INREG, VT: MVT::v2i16, Action: Expand); |
672 | |
673 | setOperationAction(Op: ISD::SHL_PARTS, VT: MVT::i32 , Action: Custom); |
674 | setOperationAction(Op: ISD::SRA_PARTS, VT: MVT::i32 , Action: Custom); |
675 | setOperationAction(Op: ISD::SRL_PARTS, VT: MVT::i32 , Action: Custom); |
676 | setOperationAction(Op: ISD::SHL_PARTS, VT: MVT::i64 , Action: Custom); |
677 | setOperationAction(Op: ISD::SRA_PARTS, VT: MVT::i64 , Action: Custom); |
678 | setOperationAction(Op: ISD::SRL_PARTS, VT: MVT::i64 , Action: Custom); |
679 | |
680 | setOperationAction(Op: ISD::BITREVERSE, VT: MVT::i32, Action: Legal); |
681 | setOperationAction(Op: ISD::BITREVERSE, VT: MVT::i64, Action: Legal); |
682 | |
683 | setOperationAction(Ops: {ISD::ROTL, ISD::ROTR}, |
684 | VTs: {MVT::i8, MVT::i16, MVT::v2i16, MVT::i32, MVT::i64}, |
685 | Action: Expand); |
686 | |
687 | if (STI.hasHWROT32()) { |
688 | setOperationAction(Ops: {ISD::FSHL, ISD::FSHR}, VT: MVT::i32, Action: Legal); |
689 | setOperationAction(Ops: {ISD::ROTL, ISD::ROTR, ISD::FSHL, ISD::FSHR}, VT: MVT::i64, |
690 | Action: Custom); |
691 | } |
692 | |
693 | setOperationAction(Op: ISD::BSWAP, VT: MVT::i16, Action: Expand); |
694 | |
695 | setOperationAction(Op: ISD::BR_JT, VT: MVT::Other, Action: Custom); |
696 | setOperationAction(Op: ISD::BRIND, VT: MVT::Other, Action: Expand); |
697 | |
698 | // We want to legalize constant-related memmove and memcpy
699 | // intrinsics.
700 | setOperationAction(Op: ISD::INTRINSIC_W_CHAIN, VT: MVT::Other, Action: Custom); |
701 | |
702 | // Turn FP extload into load/fpextend |
703 | setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f32, MemVT: MVT::f16, Action: Expand); |
704 | setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f64, MemVT: MVT::f16, Action: Expand); |
705 | setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f32, MemVT: MVT::bf16, Action: Expand); |
706 | setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f64, MemVT: MVT::bf16, Action: Expand); |
707 | setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f64, MemVT: MVT::f32, Action: Expand); |
708 | setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v2f32, MemVT: MVT::v2f16, Action: Expand); |
709 | setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v2f64, MemVT: MVT::v2f16, Action: Expand); |
710 | setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v2f32, MemVT: MVT::v2bf16, Action: Expand); |
711 | setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v2f64, MemVT: MVT::v2bf16, Action: Expand); |
712 | setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v2f64, MemVT: MVT::v2f32, Action: Expand); |
713 | setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4f32, MemVT: MVT::v4f16, Action: Expand); |
714 | setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4f64, MemVT: MVT::v4f16, Action: Expand); |
715 | setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4f32, MemVT: MVT::v4bf16, Action: Expand); |
716 | setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4f64, MemVT: MVT::v4bf16, Action: Expand); |
717 | setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4f64, MemVT: MVT::v4f32, Action: Expand); |
718 | setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v8f32, MemVT: MVT::v8f16, Action: Expand); |
719 | setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v8f64, MemVT: MVT::v8f16, Action: Expand); |
720 | setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v8f32, MemVT: MVT::v8bf16, Action: Expand); |
721 | setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v8f64, MemVT: MVT::v8bf16, Action: Expand); |
722 | // Turn FP truncstore into trunc + store. |
723 | // FIXME: vector types should also be expanded |
724 | setTruncStoreAction(ValVT: MVT::f32, MemVT: MVT::f16, Action: Expand); |
725 | setTruncStoreAction(ValVT: MVT::f64, MemVT: MVT::f16, Action: Expand); |
726 | setTruncStoreAction(ValVT: MVT::f32, MemVT: MVT::bf16, Action: Expand); |
727 | setTruncStoreAction(ValVT: MVT::f64, MemVT: MVT::bf16, Action: Expand); |
728 | setTruncStoreAction(ValVT: MVT::f64, MemVT: MVT::f32, Action: Expand); |
729 | |
730 | // PTX does not support load / store predicate registers |
731 | setOperationAction(Op: ISD::LOAD, VT: MVT::i1, Action: Custom); |
732 | setOperationAction(Op: ISD::STORE, VT: MVT::i1, Action: Custom); |
733 | |
734 | for (MVT VT : MVT::integer_valuetypes()) { |
735 | setLoadExtAction(ExtType: ISD::SEXTLOAD, ValVT: VT, MemVT: MVT::i1, Action: Promote); |
736 | setLoadExtAction(ExtType: ISD::ZEXTLOAD, ValVT: VT, MemVT: MVT::i1, Action: Promote); |
737 | setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: VT, MemVT: MVT::i1, Action: Promote); |
738 | setTruncStoreAction(ValVT: VT, MemVT: MVT::i1, Action: Expand); |
739 | } |
740 | |
741 | setCondCodeAction(CCs: {ISD::SETNE, ISD::SETEQ, ISD::SETUGE, ISD::SETULE, |
742 | ISD::SETUGT, ISD::SETULT, ISD::SETGT, ISD::SETLT, |
743 | ISD::SETGE, ISD::SETLE}, |
744 | VT: MVT::i1, Action: Expand); |
745 | |
746 | // expand extload of vector of integers. |
747 | setLoadExtAction(ExtTypes: {ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, ValVT: MVT::v2i16, |
748 | MemVT: MVT::v2i8, Action: Expand); |
749 | setTruncStoreAction(ValVT: MVT::v2i16, MemVT: MVT::v2i8, Action: Expand); |
750 | |
751 | // This is legal in NVPTX |
752 | setOperationAction(Op: ISD::ConstantFP, VT: MVT::f64, Action: Legal); |
753 | setOperationAction(Op: ISD::ConstantFP, VT: MVT::f32, Action: Legal); |
754 | setOperationAction(Op: ISD::ConstantFP, VT: MVT::f16, Action: Legal); |
755 | setOperationAction(Op: ISD::ConstantFP, VT: MVT::bf16, Action: Legal); |
756 | |
757 | setOperationAction(Ops: ISD::DYNAMIC_STACKALLOC, VTs: {MVT::i32, MVT::i64}, Action: Custom); |
758 | setOperationAction(Ops: {ISD::STACKRESTORE, ISD::STACKSAVE}, VT: MVT::Other, Action: Custom); |
759 | |
760 | // TRAP can be lowered to PTX trap |
761 | setOperationAction(Op: ISD::TRAP, VT: MVT::Other, Action: Legal); |
762 | // DEBUGTRAP can be lowered to PTX brkpt |
763 | setOperationAction(Op: ISD::DEBUGTRAP, VT: MVT::Other, Action: Legal); |
764 | |
765 | // Register custom handling for vector loads/stores |
766 | for (MVT VT : MVT::fixedlen_vector_valuetypes()) |
767 | if (IsPTXVectorType(VT)) |
768 | setOperationAction(Ops: {ISD::LOAD, ISD::STORE, ISD::INTRINSIC_W_CHAIN}, VT, |
769 | Action: Custom); |
770 | |
771 | setOperationAction(Ops: {ISD::LOAD, ISD::STORE, ISD::INTRINSIC_W_CHAIN}, |
772 | VTs: {MVT::i128, MVT::f128}, Action: Custom); |
773 | |
774 | // Support varargs. |
775 | setOperationAction(Op: ISD::VASTART, VT: MVT::Other, Action: Custom); |
776 | setOperationAction(Op: ISD::VAARG, VT: MVT::Other, Action: Custom); |
777 | setOperationAction(Op: ISD::VACOPY, VT: MVT::Other, Action: Expand); |
778 | setOperationAction(Op: ISD::VAEND, VT: MVT::Other, Action: Expand); |
779 | |
780 | // Custom handling for i8 intrinsics |
781 | setOperationAction(Op: ISD::INTRINSIC_W_CHAIN, VT: MVT::i8, Action: Custom); |
782 | |
783 | setOperationAction(Ops: {ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}, |
784 | VTs: {MVT::i16, MVT::i32, MVT::i64}, Action: Legal); |
785 | |
786 | setOperationAction(Ops: {ISD::CTPOP, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, VT: MVT::i16, |
787 | Action: Promote); |
788 | setOperationAction(Ops: {ISD::CTPOP, ISD::CTLZ}, VT: MVT::i32, Action: Legal); |
789 | setOperationAction(Ops: {ISD::CTPOP, ISD::CTLZ}, VT: MVT::i64, Action: Custom); |
790 | |
791 | setI16x2OperationAction(ISD::ABS, MVT::v2i16, Legal, Custom); |
792 | setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Custom); |
793 | setI16x2OperationAction(ISD::SMAX, MVT::v2i16, Legal, Custom); |
794 | setI16x2OperationAction(ISD::UMIN, MVT::v2i16, Legal, Custom); |
795 | setI16x2OperationAction(ISD::UMAX, MVT::v2i16, Legal, Custom); |
796 | setI16x2OperationAction(ISD::CTPOP, MVT::v2i16, Legal, Expand); |
797 | setI16x2OperationAction(ISD::CTLZ, MVT::v2i16, Legal, Expand); |
798 | |
799 | setI16x2OperationAction(ISD::ADD, MVT::v2i16, Legal, Custom); |
800 | setI16x2OperationAction(ISD::SUB, MVT::v2i16, Legal, Custom); |
801 | setI16x2OperationAction(ISD::MUL, MVT::v2i16, Legal, Custom); |
802 | setI16x2OperationAction(ISD::SHL, MVT::v2i16, Legal, Custom); |
803 | setI16x2OperationAction(ISD::SREM, MVT::v2i16, Legal, Custom); |
804 | setI16x2OperationAction(ISD::UREM, MVT::v2i16, Legal, Custom); |
805 | |
806 | // Other arithmetic and logic ops are unsupported. |
807 | setOperationAction(Ops: {ISD::SDIV, ISD::UDIV, ISD::SRA, ISD::SRL, ISD::MULHS, |
808 | ISD::MULHU, ISD::FP_TO_SINT, ISD::FP_TO_UINT, |
809 | ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::SETCC}, |
810 | VT: MVT::v2i16, Action: Expand); |
811 | |
812 | setOperationAction(Op: ISD::ADDC, VT: MVT::i32, Action: Legal); |
813 | setOperationAction(Op: ISD::ADDE, VT: MVT::i32, Action: Legal); |
814 | setOperationAction(Op: ISD::SUBC, VT: MVT::i32, Action: Legal); |
815 | setOperationAction(Op: ISD::SUBE, VT: MVT::i32, Action: Legal); |
816 | if (STI.getPTXVersion() >= 43) { |
817 | setOperationAction(Op: ISD::ADDC, VT: MVT::i64, Action: Legal); |
818 | setOperationAction(Op: ISD::ADDE, VT: MVT::i64, Action: Legal); |
819 | setOperationAction(Op: ISD::SUBC, VT: MVT::i64, Action: Legal); |
820 | setOperationAction(Op: ISD::SUBE, VT: MVT::i64, Action: Legal); |
821 | } |
822 | |
823 | setOperationAction(Op: ISD::CTTZ, VT: MVT::i16, Action: Expand); |
824 | setOperationAction(Op: ISD::CTTZ, VT: MVT::v2i16, Action: Expand); |
825 | setOperationAction(Op: ISD::CTTZ, VT: MVT::i32, Action: Expand); |
826 | setOperationAction(Op: ISD::CTTZ, VT: MVT::i64, Action: Expand); |
827 | |
828 | // PTX does not directly support SELP of i1, so promote to i32 first |
829 | setOperationAction(Op: ISD::SELECT, VT: MVT::i1, Action: Custom); |
830 | |
831 | // PTX cannot multiply two i64s in a single instruction. |
832 | setOperationAction(Op: ISD::SMUL_LOHI, VT: MVT::i64, Action: Expand); |
833 | setOperationAction(Op: ISD::UMUL_LOHI, VT: MVT::i64, Action: Expand); |
834 | |
835 | // We have some custom DAG combine patterns for these nodes |
836 | setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD, |
837 | ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM, ISD::VSELECT, |
838 | ISD::BUILD_VECTOR, ISD::ADDRSPACECAST, ISD::LOAD, |
839 | ISD::STORE}); |
840 | |
841 | // setcc for f16x2 and bf16x2 needs special handling to prevent |
842 | // legalizer's attempt to scalarize it due to v2i1 not being legal. |
843 | if (STI.allowFP16Math() || STI.hasBF16Math()) |
844 | setTargetDAGCombine(ISD::SETCC); |
845 | |
846 | // Promote fp16 arithmetic if fp16 hardware isn't available or the |
847 | // user passed --nvptx-no-fp16-math. The flag is useful because, |
848 | // although sm_53+ GPUs have some sort of FP16 support in |
849 | // hardware, only sm_53 and sm_60 have a full implementation. Others
850 | // only have a token amount of hardware and are likely to run faster
851 | // by using fp32 units instead. |
852 | for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) { |
853 | setFP16OperationAction(Op, MVT::f16, Legal, Promote); |
854 | setFP16OperationAction(Op, MVT::v2f16, Legal, Expand); |
855 | setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand); |
856 | // bf16 must be promoted to f32. |
857 | setBF16OperationAction(Op, MVT::bf16, Legal, Promote); |
858 | if (getOperationAction(Op, VT: MVT::bf16) == Promote) |
859 | AddPromotedToType(Opc: Op, OrigVT: MVT::bf16, DestVT: MVT::f32); |
860 | } |
861 | |
862 | // On SM80, we select add/mul/sub as fma to avoid promotion to float |
863 | for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB}) { |
864 | for (const auto &VT : {MVT::bf16, MVT::v2bf16}) { |
865 | if (!STI.hasNativeBF16Support(Opcode: Op) && STI.hasNativeBF16Support(Opcode: ISD::FMA)) { |
866 | setOperationAction(Op, VT, Action: Custom); |
867 | } |
868 | } |
869 | } |
870 | |
871 | // f16/f16x2 neg was introduced in PTX 60, SM_53. |
872 | const bool IsFP16FP16x2NegAvailable = STI.getSmVersion() >= 53 && |
873 | STI.getPTXVersion() >= 60 && |
874 | STI.allowFP16Math(); |
875 | for (const auto &VT : {MVT::f16, MVT::v2f16}) |
876 | setOperationAction(Op: ISD::FNEG, VT, |
877 | Action: IsFP16FP16x2NegAvailable ? Legal : Expand); |
878 | |
879 | setBF16OperationAction(ISD::FNEG, MVT::bf16, Legal, Expand); |
880 | setBF16OperationAction(ISD::FNEG, MVT::v2bf16, Legal, Expand); |
881 | // (would be) Library functions. |
882 | |
883 | // These map to conversion instructions for scalar FP types. |
884 | for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT, |
885 | ISD::FROUNDEVEN, ISD::FTRUNC}) { |
886 | setOperationAction(Op, VT: MVT::f16, Action: Legal); |
887 | setOperationAction(Op, VT: MVT::f32, Action: Legal); |
888 | setOperationAction(Op, VT: MVT::f64, Action: Legal); |
889 | setOperationAction(Op, VT: MVT::v2f16, Action: Expand); |
890 | setOperationAction(Op, VT: MVT::v2bf16, Action: Expand); |
891 | setBF16OperationAction(Op, MVT::bf16, Legal, Promote); |
892 | if (getOperationAction(Op, VT: MVT::bf16) == Promote) |
893 | AddPromotedToType(Opc: Op, OrigVT: MVT::bf16, DestVT: MVT::f32); |
894 | } |
895 | |
896 | if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71) { |
897 | setOperationAction(Op: ISD::BF16_TO_FP, VT: MVT::f32, Action: Expand); |
898 | } |
899 | if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) { |
900 | for (MVT VT : {MVT::bf16, MVT::f32, MVT::f64}) { |
901 | setOperationAction(Op: ISD::FP_EXTEND, VT, Action: Custom); |
902 | setOperationAction(Op: ISD::FP_ROUND, VT, Action: Custom); |
903 | } |
904 | } |
905 | |
906 | // sm_80 only has conversions between f32 and bf16. Custom lower all other |
907 | // bf16 conversions. |
908 | if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) { |
909 | for (MVT VT : {MVT::i1, MVT::i16, MVT::i32, MVT::i64}) { |
910 | setOperationAction( |
911 | Ops: {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT}, |
912 | VT, Action: Custom); |
913 | } |
914 | setOperationAction( |
915 | Ops: {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT}, |
916 | VT: MVT::bf16, Action: Custom); |
917 | } |
918 | |
919 | setOperationAction(Op: ISD::FROUND, VT: MVT::f16, Action: Promote); |
920 | setOperationAction(Op: ISD::FROUND, VT: MVT::v2f16, Action: Expand); |
921 | setOperationAction(Op: ISD::FROUND, VT: MVT::v2bf16, Action: Expand); |
922 | setOperationAction(Op: ISD::FROUND, VT: MVT::f32, Action: Custom); |
923 | setOperationAction(Op: ISD::FROUND, VT: MVT::f64, Action: Custom); |
924 | setOperationAction(Op: ISD::FROUND, VT: MVT::bf16, Action: Promote); |
925 | AddPromotedToType(Opc: ISD::FROUND, OrigVT: MVT::bf16, DestVT: MVT::f32); |
926 | |
927 | // 'Expand' implements FCOPYSIGN without calling an external library. |
928 | setOperationAction(Op: ISD::FCOPYSIGN, VT: MVT::f16, Action: Expand); |
929 | setOperationAction(Op: ISD::FCOPYSIGN, VT: MVT::v2f16, Action: Expand); |
930 | setOperationAction(Op: ISD::FCOPYSIGN, VT: MVT::bf16, Action: Expand); |
931 | setOperationAction(Op: ISD::FCOPYSIGN, VT: MVT::v2bf16, Action: Expand); |
932 | setOperationAction(Op: ISD::FCOPYSIGN, VT: MVT::f32, Action: Custom); |
933 | setOperationAction(Op: ISD::FCOPYSIGN, VT: MVT::f64, Action: Custom); |
934 | |
935 | // These map to corresponding instructions for f32/f64. f16 must be |
936 | // promoted to f32. v2f16 is expanded to f16, which is then promoted |
937 | // to f32. |
938 | for (const auto &Op : |
939 | {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS}) { |
940 | setOperationAction(Op, VT: MVT::f16, Action: Promote); |
941 | setOperationAction(Op, VT: MVT::f32, Action: Legal); |
942 | setOperationAction(Op, VT: MVT::f64, Action: Legal); |
943 | setOperationAction(Op, VT: MVT::v2f16, Action: Expand); |
944 | setOperationAction(Op, VT: MVT::v2bf16, Action: Expand); |
945 | setOperationAction(Op, VT: MVT::bf16, Action: Promote); |
946 | AddPromotedToType(Opc: Op, OrigVT: MVT::bf16, DestVT: MVT::f32); |
947 | } |
948 | setOperationAction(Ops: ISD::FREM, VTs: {MVT::f32, MVT::f64}, Action: Custom); |
949 | |
950 | setOperationAction(Ops: ISD::FABS, VTs: {MVT::f32, MVT::f64}, Action: Legal); |
951 | if (STI.getPTXVersion() >= 65) { |
952 | setFP16OperationAction(ISD::FABS, MVT::f16, Legal, Promote); |
953 | setFP16OperationAction(ISD::FABS, MVT::v2f16, Legal, Expand); |
954 | } else { |
955 | setOperationAction(Op: ISD::FABS, VT: MVT::f16, Action: Promote); |
956 | setOperationAction(Op: ISD::FABS, VT: MVT::v2f16, Action: Expand); |
957 | } |
958 | setBF16OperationAction(ISD::FABS, MVT::v2bf16, Legal, Expand); |
959 | setBF16OperationAction(ISD::FABS, MVT::bf16, Legal, Promote); |
960 | if (getOperationAction(Op: ISD::FABS, VT: MVT::bf16) == Promote) |
961 | AddPromotedToType(Opc: ISD::FABS, OrigVT: MVT::bf16, DestVT: MVT::f32); |
962 | |
963 | for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) { |
964 | setOperationAction(Op, VT: MVT::f32, Action: Legal); |
965 | setOperationAction(Op, VT: MVT::f64, Action: Legal); |
966 | setFP16OperationAction(Op, MVT::f16, Legal, Promote); |
967 | setFP16OperationAction(Op, MVT::v2f16, Legal, Expand); |
968 | setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand); |
969 | setBF16OperationAction(Op, MVT::bf16, Legal, Promote); |
970 | if (getOperationAction(Op, VT: MVT::bf16) == Promote) |
971 | AddPromotedToType(Opc: Op, OrigVT: MVT::bf16, DestVT: MVT::f32); |
972 | } |
973 | bool SupportsF32MinMaxNaN = |
974 | STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70; |
975 | for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) { |
976 | setOperationAction(Op, VT: MVT::f32, Action: SupportsF32MinMaxNaN ? Legal : Expand); |
977 | setFP16OperationAction(Op, MVT::f16, Legal, Expand); |
978 | setFP16OperationAction(Op, MVT::v2f16, Legal, Expand); |
979 | setBF16OperationAction(Op, MVT::bf16, Legal, Expand); |
980 | setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand); |
981 | } |
982 | |
983 | // Custom lowering for inline asm with 128-bit operands |
984 | setOperationAction(Op: ISD::CopyToReg, VT: MVT::i128, Action: Custom); |
985 | setOperationAction(Op: ISD::CopyFromReg, VT: MVT::i128, Action: Custom); |
986 | |
987 | // FEXP2 support: |
988 | // - f32 |
989 | // - f16/f16x2 (sm_70+, PTX 7.0+) |
990 | // - bf16/bf16x2 (sm_90+, PTX 7.8+) |
991 | // When f16/bf16 types aren't supported, they are promoted/expanded to f32. |
992 | setOperationAction(Op: ISD::FEXP2, VT: MVT::f32, Action: Legal); |
993 | setFP16OperationAction(ISD::FEXP2, MVT::f16, Legal, Promote); |
994 | setFP16OperationAction(ISD::FEXP2, MVT::v2f16, Legal, Expand); |
995 | setBF16OperationAction(ISD::FEXP2, MVT::bf16, Legal, Promote); |
996 | setBF16OperationAction(ISD::FEXP2, MVT::v2bf16, Legal, Expand); |
997 | |
998 | // FLOG2 supports f32 only |
999 | // f16/bf16 types aren't supported, but they are promoted/expanded to f32. |
1000 | if (UseApproxLog2F32) { |
1001 | setOperationAction(Op: ISD::FLOG2, VT: MVT::f32, Action: Legal); |
1002 | setOperationPromotedToType(Opc: ISD::FLOG2, OrigVT: MVT::f16, DestVT: MVT::f32); |
1003 | setOperationPromotedToType(Opc: ISD::FLOG2, OrigVT: MVT::bf16, DestVT: MVT::f32); |
1004 | setOperationAction(Ops: ISD::FLOG2, VTs: {MVT::v2f16, MVT::v2bf16}, Action: Expand); |
1005 | } |
1006 | |
1007 | setOperationAction(Ops: ISD::ADDRSPACECAST, VTs: {MVT::i32, MVT::i64}, Action: Custom); |
1008 | |
1009 | setOperationAction(Ops: ISD::ATOMIC_LOAD_SUB, VTs: {MVT::i32, MVT::i64}, Action: Expand); |
1010 | // No FPOW or FREM in PTX. |
1011 | |
1012 | // Now deduce the information based on the above mentioned |
1013 | // actions |
1014 | computeRegisterProperties(TRI: STI.getRegisterInfo()); |
1015 | |
1016 | // PTX support for 16-bit CAS is emulated. Only use 32+ |
1017 | setMinCmpXchgSizeInBits(STI.getMinCmpXchgSizeInBits()); |
1018 | setMaxAtomicSizeInBitsSupported(64); |
1019 | setMaxDivRemBitWidthSupported(64); |
1020 | |
1021 | // Custom lowering for tcgen05.ld vector operands |
1022 | setOperationAction(Ops: ISD::INTRINSIC_W_CHAIN, |
1023 | VTs: {MVT::v2i32, MVT::v4i32, MVT::v8i32, MVT::v16i32, |
1024 | MVT::v32i32, MVT::v64i32, MVT::v128i32}, |
1025 | Action: Custom); |
1026 | |
1027 | // Custom lowering for tcgen05.st vector operands |
1028 | setOperationAction(Ops: ISD::INTRINSIC_VOID, |
1029 | VTs: {MVT::v2i32, MVT::v4i32, MVT::v8i32, MVT::v16i32, |
1030 | MVT::v32i32, MVT::v64i32, MVT::v128i32}, |
1031 | Action: Custom); |
1032 | |
1033 | setOperationAction(Op: ISD::INTRINSIC_WO_CHAIN, VT: MVT::Other, Action: Custom); |
1034 | // Enable custom lowering for the i128 operand of the clusterlaunchcontrol intrinsics
1035 | setOperationAction(Op: ISD::INTRINSIC_WO_CHAIN, VT: MVT::i128, Action: Custom); |
1036 | } |
1037 | |
1038 | const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { |
1039 | |
1040 | #define MAKE_CASE(V) \ |
1041 | case V: \ |
1042 | return #V; |
1043 | |
1044 | switch ((NVPTXISD::NodeType)Opcode) { |
1045 | case NVPTXISD::FIRST_NUMBER: |
1046 | break; |
1047 | |
1048 | MAKE_CASE(NVPTXISD::RET_GLUE) |
1049 | MAKE_CASE(NVPTXISD::DeclareArrayParam) |
1050 | MAKE_CASE(NVPTXISD::DeclareScalarParam) |
1051 | MAKE_CASE(NVPTXISD::CALL) |
1052 | MAKE_CASE(NVPTXISD::LoadParam) |
1053 | MAKE_CASE(NVPTXISD::LoadParamV2) |
1054 | MAKE_CASE(NVPTXISD::LoadParamV4) |
1055 | MAKE_CASE(NVPTXISD::StoreParam) |
1056 | MAKE_CASE(NVPTXISD::StoreParamV2) |
1057 | MAKE_CASE(NVPTXISD::StoreParamV4) |
1058 | MAKE_CASE(NVPTXISD::MoveParam) |
1059 | MAKE_CASE(NVPTXISD::UNPACK_VECTOR) |
1060 | MAKE_CASE(NVPTXISD::BUILD_VECTOR) |
1061 | MAKE_CASE(NVPTXISD::CallPrototype) |
1062 | MAKE_CASE(NVPTXISD::ProxyReg) |
1063 | MAKE_CASE(NVPTXISD::LoadV2) |
1064 | MAKE_CASE(NVPTXISD::LoadV4) |
1065 | MAKE_CASE(NVPTXISD::LoadV8) |
1066 | MAKE_CASE(NVPTXISD::LDUV2) |
1067 | MAKE_CASE(NVPTXISD::LDUV4) |
1068 | MAKE_CASE(NVPTXISD::StoreV2) |
1069 | MAKE_CASE(NVPTXISD::StoreV4) |
1070 | MAKE_CASE(NVPTXISD::StoreV8) |
1071 | MAKE_CASE(NVPTXISD::FSHL_CLAMP) |
1072 | MAKE_CASE(NVPTXISD::FSHR_CLAMP) |
1073 | MAKE_CASE(NVPTXISD::BFE) |
1074 | MAKE_CASE(NVPTXISD::BFI) |
1075 | MAKE_CASE(NVPTXISD::PRMT) |
1076 | MAKE_CASE(NVPTXISD::FCOPYSIGN) |
1077 | MAKE_CASE(NVPTXISD::DYNAMIC_STACKALLOC) |
1078 | MAKE_CASE(NVPTXISD::STACKRESTORE) |
1079 | MAKE_CASE(NVPTXISD::STACKSAVE) |
1080 | MAKE_CASE(NVPTXISD::SETP_F16X2) |
1081 | MAKE_CASE(NVPTXISD::SETP_BF16X2) |
1082 | MAKE_CASE(NVPTXISD::MUL_WIDE_SIGNED) |
1083 | MAKE_CASE(NVPTXISD::MUL_WIDE_UNSIGNED) |
1084 | MAKE_CASE(NVPTXISD::BrxEnd) |
1085 | MAKE_CASE(NVPTXISD::BrxItem) |
1086 | MAKE_CASE(NVPTXISD::BrxStart) |
1087 | MAKE_CASE(NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_IS_CANCELED) |
1088 | MAKE_CASE(NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_X) |
1089 | MAKE_CASE(NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Y) |
1090 | MAKE_CASE(NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Z) |
1091 | } |
1092 | return nullptr; |
1093 | |
1094 | #undef MAKE_CASE |
1095 | } |
1096 | |
1097 | TargetLoweringBase::LegalizeTypeAction |
1098 | NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const { |
1099 | if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 && |
1100 | VT.getScalarType() == MVT::i1) |
1101 | return TypeSplitVector; |
1102 | return TargetLoweringBase::getPreferredVectorAction(VT); |
1103 | } |
1104 | |
1105 | SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, |
1106 | int Enabled, int &ExtraSteps,
1107 | bool &UseOneConst, |
1108 | bool Reciprocal) const { |
1109 | if (!(Enabled == ReciprocalEstimate::Enabled || |
1110 | (Enabled == ReciprocalEstimate::Unspecified && |
1111 | !usePrecSqrtF32(MF: DAG.getMachineFunction())))) |
1112 | return SDValue(); |
1113 | |
1114 | if (ExtraSteps == ReciprocalEstimate::Unspecified) |
1115 | ExtraSteps = 0; |
1116 | |
1117 | SDLoc DL(Operand); |
1118 | EVT VT = Operand.getValueType(); |
1119 | bool Ftz = useF32FTZ(MF: DAG.getMachineFunction()); |
1120 | |
1121 | auto MakeIntrinsicCall = [&](Intrinsic::ID IID) { |
1122 | return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT, |
1123 | N1: DAG.getConstant(Val: IID, DL, VT: MVT::i32), N2: Operand); |
1124 | }; |
1125 | |
1126 | // The sqrt and rsqrt refinement processes assume we always start out with an |
1127 | // approximation of the rsqrt. Therefore, if we're going to do any refinement |
1128 | // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing |
1129 | // any refinement, we must return a regular sqrt. |
1130 | if (Reciprocal || ExtraSteps > 0) { |
1131 | if (VT == MVT::f32) |
1132 | return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f |
1133 | : Intrinsic::nvvm_rsqrt_approx_f); |
1134 | else if (VT == MVT::f64) |
1135 | return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d); |
1136 | else |
1137 | return SDValue(); |
1138 | } else { |
1139 | if (VT == MVT::f32) |
1140 | return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f |
1141 | : Intrinsic::nvvm_sqrt_approx_f); |
1142 | else { |
1143 | // There's no sqrt.approx.f64 instruction, so we emit |
1144 | // reciprocal(rsqrt(x)). This is faster than |
1145 | // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain |
1146 | // x * rsqrt(x).) |
1147 | return DAG.getNode( |
1148 | Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT, |
1149 | N1: DAG.getConstant(Val: Intrinsic::nvvm_rcp_approx_ftz_d, DL, VT: MVT::i32), |
1150 | N2: MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d)); |
1151 | } |
1152 | } |
1153 | } |
1154 | |
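// For example, for an indirect call to a function with signature
// "i32 (i32, ptr)", this produces roughly:
//   prototype_0 : .callprototype (.param .b32 _) _ (.param .b32 _, .param .b64 _);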
1155 | std::string NVPTXTargetLowering::getPrototype( |
1156 | const DataLayout &DL, Type *RetTy, const ArgListTy &Args, |
1157 | const SmallVectorImpl<ISD::OutputArg> &Outs, |
1158 | std::optional<unsigned> FirstVAArg, const CallBase &CB, |
1159 | unsigned UniqueCallSite) const { |
1160 | auto PtrVT = getPointerTy(DL); |
1161 | |
1162 | std::string Prototype; |
1163 | raw_string_ostream O(Prototype); |
1164 | O << "prototype_" << UniqueCallSite << " : .callprototype " ; |
1165 | |
1166 | if (RetTy->isVoidTy()) { |
1167 | O << "()" ; |
1168 | } else { |
1169 | O << "(" ; |
1170 | if (shouldPassAsArray(Ty: RetTy)) { |
1171 | const Align RetAlign = getArgumentAlignment(CB: &CB, Ty: RetTy, Idx: 0, DL); |
1172 | O << ".param .align " << RetAlign.value() << " .b8 _[" |
1173 | << DL.getTypeAllocSize(Ty: RetTy) << "]" ; |
1174 | } else if (RetTy->isFloatingPointTy() || RetTy->isIntegerTy()) { |
1175 | unsigned size = 0; |
1176 | if (auto *ITy = dyn_cast<IntegerType>(Val: RetTy)) { |
1177 | size = ITy->getBitWidth(); |
1178 | } else { |
1179 | assert(RetTy->isFloatingPointTy() && |
1180 | "Floating point type expected here" ); |
1181 | size = RetTy->getPrimitiveSizeInBits(); |
1182 | } |
1183 | // PTX ABI requires all scalar return values to be at least 32 |
1184 | // bits in size. fp16 normally uses .b16 as its storage type in |
1185 | // PTX, so its size must be adjusted here, too. |
1186 | size = promoteScalarArgumentSize(size); |
1187 | |
1188 | O << ".param .b" << size << " _" ; |
1189 | } else if (isa<PointerType>(Val: RetTy)) { |
1190 | O << ".param .b" << PtrVT.getSizeInBits() << " _" ; |
1191 | } else { |
1192 | llvm_unreachable("Unknown return type" ); |
1193 | } |
1194 | O << ") " ; |
1195 | } |
1196 | O << "_ (" ; |
1197 | |
1198 | bool first = true; |
1199 | |
1200 | const unsigned NumArgs = FirstVAArg.value_or(u: Args.size()); |
1201 | auto AllOuts = ArrayRef(Outs); |
1202 | for (const unsigned I : llvm::seq(Size: NumArgs)) { |
1203 | const auto ArgOuts = |
1204 | AllOuts.take_while(Pred: [I](auto O) { return O.OrigArgIndex == I; }); |
1205 | AllOuts = AllOuts.drop_front(N: ArgOuts.size()); |
1206 | |
1207 | Type *Ty = Args[I].Ty; |
1208 | if (!first) { |
1209 | O << ", " ; |
1210 | } |
1211 | first = false; |
1212 | |
1213 | if (ArgOuts[0].Flags.isByVal()) { |
1214 | // Indirect calls need strict ABI alignment so we disable optimizations by |
1215 | // not providing a function to optimize. |
1216 | Type *ETy = Args[I].IndirectType; |
1217 | Align InitialAlign = ArgOuts[0].Flags.getNonZeroByValAlign(); |
1218 | Align ParamByValAlign = |
1219 | getFunctionByValParamAlign(/*F=*/nullptr, ArgTy: ETy, InitialAlign, DL); |
1220 | |
1221 | O << ".param .align " << ParamByValAlign.value() << " .b8 _[" |
1222 | << ArgOuts[0].Flags.getByValSize() << "]" ; |
1223 | } else { |
1224 | if (shouldPassAsArray(Ty)) { |
1225 | Align ParamAlign = |
1226 | getArgumentAlignment(CB: &CB, Ty, Idx: I + AttributeList::FirstArgIndex, DL); |
1227 | O << ".param .align " << ParamAlign.value() << " .b8 _[" |
1228 | << DL.getTypeAllocSize(Ty) << "]" ; |
1229 | continue; |
1230 | } |
1231 | // i8 types in IR will be i16 types in SDAG |
1232 | assert((getValueType(DL, Ty) == ArgOuts[0].VT || |
1233 | (getValueType(DL, Ty) == MVT::i8 && ArgOuts[0].VT == MVT::i16)) && |
1234 | "type mismatch between callee prototype and arguments" ); |
1235 | // scalar type |
1236 | unsigned sz = 0; |
1237 | if (auto *ITy = dyn_cast<IntegerType>(Val: Ty)) { |
1238 | sz = promoteScalarArgumentSize(size: ITy->getBitWidth()); |
1239 | } else if (isa<PointerType>(Val: Ty)) { |
1240 | sz = PtrVT.getSizeInBits(); |
1241 | } else { |
1242 | sz = Ty->getPrimitiveSizeInBits(); |
1243 | } |
1244 | O << ".param .b" << sz << " _" ; |
1245 | } |
1246 | } |
1247 | |
1248 | if (FirstVAArg) |
1249 | O << (first ? "" : "," ) << " .param .align " |
1250 | << STI.getMaxRequiredAlignment() << " .b8 _[]" ; |
1251 | O << ")" ; |
1252 | if (shouldEmitPTXNoReturn(V: &CB, TM: *nvTM)) |
1253 | O << " .noreturn" ; |
1254 | O << ";" ; |
1255 | |
1256 | return Prototype; |
1257 | } |
1258 | |
1259 | Align NVPTXTargetLowering::getFunctionArgumentAlignment( |
1260 | const Function *F, Type *Ty, unsigned Idx, const DataLayout &DL) const { |
1261 | return getAlign(F: *F, Index: Idx).value_or(u: getFunctionParamOptimizedAlign(F, ArgTy: Ty, DL)); |
1262 | } |
1263 | |
1264 | Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty, |
1265 | unsigned Idx, |
1266 | const DataLayout &DL) const { |
1267 | if (!CB) { |
1268 |     // We have no call site, so fall back to the ABI type alignment. |
1269 | return DL.getABITypeAlign(Ty); |
1270 | } |
1271 | |
1272 | const Function *DirectCallee = CB->getCalledFunction(); |
1273 | |
1274 | if (!DirectCallee) { |
1275 | // We don't have a direct function symbol, but that may be because of |
1276 | // constant cast instructions in the call. |
1277 | |
1278 | // With bitcast'd call targets, the instruction will be the call |
1279 | if (const auto *CI = dyn_cast<CallInst>(Val: CB)) { |
1280 | // Check if we have call alignment metadata |
1281 | if (MaybeAlign StackAlign = getAlign(*CI, Idx)) |
1282 | return StackAlign.value(); |
1283 | } |
1284 | DirectCallee = getMaybeBitcastedCallee(CB); |
1285 | } |
1286 | |
1287 | // Check for function alignment information if we found that the |
1288 | // ultimate target is a Function |
1289 | if (DirectCallee) |
1290 | return getFunctionArgumentAlignment(F: DirectCallee, Ty, Idx, DL); |
1291 | |
1292 | // Call is indirect, fall back to the ABI type alignment |
1293 | return DL.getABITypeAlign(Ty); |
1294 | } |
1295 | |
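// Map floating-point element types to the integer type of the same width so
// that the byte-wise shift/mask logic below can operate on them (for example,
// f32 and v2f16 both become i32). Returns true if the type was changed.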
1296 | static bool adjustElementType(EVT &ElementType) { |
1297 | switch (ElementType.getSimpleVT().SimpleTy) { |
1298 | default: |
1299 | return false; |
1300 | case MVT::f16: |
1301 | case MVT::bf16: |
1302 | ElementType = MVT::i16; |
1303 | return true; |
1304 | case MVT::f32: |
1305 | case MVT::v2f16: |
1306 | case MVT::v2bf16: |
1307 | ElementType = MVT::i32; |
1308 | return true; |
1309 | case MVT::f64: |
1310 | ElementType = MVT::i64; |
1311 | return true; |
1312 | } |
1313 | } |
1314 | |
1315 | // Use byte-store when the param address of the argument value is unaligned. |
1316 | // This may happen when the argument value is a field of a packed structure. |
1317 | // |
1318 | // This is called in LowerCall() when passing the param values. |
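//
// For example, a 32-bit element stored at an unaligned offset is emitted as
// four st.param.b8 stores, one per byte of the shifted value.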
1319 | static SDValue LowerUnalignedStoreParam(SelectionDAG &DAG, SDValue Chain, |
1320 | uint64_t Offset, EVT ElementType, |
1321 | SDValue StVal, SDValue &InGlue, |
1322 | unsigned ArgID, const SDLoc &dl) { |
1323 | // Bit logic only works on integer types |
1324 | if (adjustElementType(ElementType)) |
1325 | StVal = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: ElementType, Operand: StVal); |
1326 | |
1327 | // Store each byte |
1328 | SDVTList StoreVTs = DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue); |
1329 | for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) { |
1330 | // Shift the byte to the last byte position |
1331 | SDValue ShiftVal = DAG.getNode(Opcode: ISD::SRL, DL: dl, VT: ElementType, N1: StVal, |
1332 | N2: DAG.getConstant(Val: i * 8, DL: dl, VT: MVT::i32)); |
1333 | SDValue StoreOperands[] = {Chain, DAG.getConstant(Val: ArgID, DL: dl, VT: MVT::i32), |
1334 | DAG.getConstant(Val: Offset + i, DL: dl, VT: MVT::i32), |
1335 | ShiftVal, InGlue}; |
1336 | // Trunc store only the last byte by using |
1337 | // st.param.b8 |
1338 | // The register type can be larger than b8. |
1339 | Chain = DAG.getMemIntrinsicNode( |
1340 | Opcode: NVPTXISD::StoreParam, dl, VTList: StoreVTs, Ops: StoreOperands, MemVT: MVT::i8, |
1341 | PtrInfo: MachinePointerInfo(), Alignment: Align(1), Flags: MachineMemOperand::MOStore); |
1342 | InGlue = Chain.getValue(R: 1); |
1343 | } |
1344 | return Chain; |
1345 | } |
1346 | |
1347 | // Use byte-load when the param address of the returned value is unaligned. |
1348 | // This may happen when the returned value is a field of a packed structure. |
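//
// Each byte is loaded with a byte-wide param load, zero-extended to the merged
// integer type, masked to its low 8 bits, shifted into position, and OR-ed
// into the accumulated result.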
1349 | static SDValue |
1350 | LowerUnalignedLoadRetParam(SelectionDAG &DAG, SDValue &Chain, uint64_t Offset, |
1351 | EVT ElementType, SDValue &InGlue, |
1352 | SmallVectorImpl<SDValue> &TempProxyRegOps, |
1353 | const SDLoc &dl) { |
1354 | // Bit logic only works on integer types |
1355 | EVT MergedType = ElementType; |
1356 | adjustElementType(ElementType&: MergedType); |
1357 | |
1358 |   // Load each byte and construct the whole value. The initial value is 0. |
1359 | SDValue RetVal = DAG.getConstant(Val: 0, DL: dl, VT: MergedType); |
1360 | // LoadParamMemI8 loads into i16 register only |
1361 | SDVTList LoadVTs = DAG.getVTList(VT1: MVT::i16, VT2: MVT::Other, VT3: MVT::Glue); |
1362 | for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) { |
1363 | SDValue LoadOperands[] = {Chain, DAG.getConstant(Val: 1, DL: dl, VT: MVT::i32), |
1364 | DAG.getConstant(Val: Offset + i, DL: dl, VT: MVT::i32), |
1365 | InGlue}; |
1366 | // This will be selected to LoadParamMemI8 |
1367 | SDValue LdVal = |
1368 | DAG.getMemIntrinsicNode(Opcode: NVPTXISD::LoadParam, dl, VTList: LoadVTs, Ops: LoadOperands, |
1369 | MemVT: MVT::i8, PtrInfo: MachinePointerInfo(), Alignment: Align(1)); |
1370 | SDValue TmpLdVal = LdVal.getValue(R: 0); |
1371 | Chain = LdVal.getValue(R: 1); |
1372 | InGlue = LdVal.getValue(R: 2); |
1373 | |
1374 | TmpLdVal = DAG.getNode(Opcode: NVPTXISD::ProxyReg, DL: dl, |
1375 | VT: TmpLdVal.getSimpleValueType(), Operand: TmpLdVal); |
1376 | TempProxyRegOps.push_back(Elt: TmpLdVal); |
1377 | |
1378 | SDValue CMask = DAG.getConstant(Val: 255, DL: dl, VT: MergedType); |
1379 | SDValue CShift = DAG.getConstant(Val: i * 8, DL: dl, VT: MVT::i32); |
1380 | // Need to extend the i16 register to the whole width. |
1381 | TmpLdVal = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: dl, VT: MergedType, Operand: TmpLdVal); |
1382 | // Mask off the high bits. Leave only the lower 8bits. |
1383 | // Do this because we are using loadparam.b8. |
1384 | TmpLdVal = DAG.getNode(Opcode: ISD::AND, DL: dl, VT: MergedType, N1: TmpLdVal, N2: CMask); |
1385 | // Shift and merge |
1386 | TmpLdVal = DAG.getNode(Opcode: ISD::SHL, DL: dl, VT: MergedType, N1: TmpLdVal, N2: CShift); |
1387 | RetVal = DAG.getNode(Opcode: ISD::OR, DL: dl, VT: MergedType, N1: RetVal, N2: TmpLdVal); |
1388 | } |
1389 | if (ElementType != MergedType) |
1390 | RetVal = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: ElementType, Operand: RetVal); |
1391 | |
1392 | return RetVal; |
1393 | } |
1394 | |
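// A direct call whose call-site function type differs from the callee's
// declared type (e.g. a call through a mismatched function pointer cast) is
// converted to an indirect call so that the emitted prototype matches the call
// site rather than the callee declaration.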
1395 | static bool shouldConvertToIndirectCall(const CallBase *CB, |
1396 | const GlobalAddressSDNode *Func) { |
1397 | if (!Func) |
1398 | return false; |
1399 | if (auto *CalleeFunc = dyn_cast<Function>(Val: Func->getGlobal())) |
1400 | return CB->getFunctionType() != CalleeFunc->getFunctionType(); |
1401 | return false; |
1402 | } |
1403 | |
1404 | static MachinePointerInfo refinePtrAS(SDValue &Ptr, SelectionDAG &DAG, |
1405 | const DataLayout &DL, |
1406 | const TargetLowering &TL) { |
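  // Frame indices always refer to the local stack, so address them directly in
  // the .local address space and report that address space in the returned
  // MachinePointerInfo.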
1407 | if (Ptr->getOpcode() == ISD::FrameIndex) { |
1408 | auto Ty = TL.getPointerTy(DL, AS: ADDRESS_SPACE_LOCAL); |
1409 | Ptr = DAG.getAddrSpaceCast(dl: SDLoc(), VT: Ty, Ptr, SrcAS: ADDRESS_SPACE_GENERIC, |
1410 | DestAS: ADDRESS_SPACE_LOCAL); |
1411 | |
1412 | return MachinePointerInfo(ADDRESS_SPACE_LOCAL); |
1413 | } |
1414 | |
1415 |   // Peel off an addrspacecast to generic and load directly from the specific |
1416 | // address space. |
1417 | if (Ptr->getOpcode() == ISD::ADDRSPACECAST) { |
1418 | const auto *ASC = cast<AddrSpaceCastSDNode>(Val&: Ptr); |
1419 | if (ASC->getDestAddressSpace() == ADDRESS_SPACE_GENERIC) { |
1420 | Ptr = ASC->getOperand(Num: 0); |
1421 | return MachinePointerInfo(ASC->getSrcAddressSpace()); |
1422 | } |
1423 | } |
1424 | |
1425 | return MachinePointerInfo(); |
1426 | } |
1427 | |
1428 | static ISD::NodeType getExtOpcode(const ISD::ArgFlagsTy &Flags) { |
1429 | if (Flags.isSExt()) |
1430 | return ISD::SIGN_EXTEND; |
1431 | if (Flags.isZExt()) |
1432 | return ISD::ZERO_EXTEND; |
1433 | return ISD::ANY_EXTEND; |
1434 | } |
1435 | |
1436 | static SDValue correctParamType(SDValue V, EVT ExpectedVT, |
1437 | ISD::ArgFlagsTy Flags, SelectionDAG &DAG, |
1438 | SDLoc dl) { |
1439 | const EVT ActualVT = V.getValueType(); |
1440 | assert((ActualVT == ExpectedVT || |
1441 | (ExpectedVT.isInteger() && ActualVT.isInteger())) && |
1442 | "Non-integer argument type size mismatch" ); |
1443 | if (ExpectedVT.bitsGT(VT: ActualVT)) |
1444 | return DAG.getNode(Opcode: getExtOpcode(Flags), DL: dl, VT: ExpectedVT, Operand: V); |
1445 | if (ExpectedVT.bitsLT(VT: ActualVT)) |
1446 | return DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: ExpectedVT, Operand: V); |
1447 | |
1448 | return V; |
1449 | } |
1450 | |
1451 | SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, |
1452 | SmallVectorImpl<SDValue> &InVals) const { |
1453 | |
1454 | if (CLI.IsVarArg && (STI.getPTXVersion() < 60 || STI.getSmVersion() < 30)) |
1455 | report_fatal_error( |
1456 | reason: "Support for variadic functions (unsized array parameter) introduced " |
1457 | "in PTX ISA version 6.0 and requires target sm_30." ); |
1458 | |
1459 | SelectionDAG &DAG = CLI.DAG; |
1460 | SDLoc dl = CLI.DL; |
1461 | SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; |
1462 | SDValue Chain = CLI.Chain; |
1463 | SDValue Callee = CLI.Callee; |
1464 | bool &isTailCall = CLI.IsTailCall; |
1465 | ArgListTy &Args = CLI.getArgs(); |
1466 | Type *RetTy = CLI.RetTy; |
1467 | const CallBase *CB = CLI.CB; |
1468 | const DataLayout &DL = DAG.getDataLayout(); |
1469 | |
1470 | const auto GetI32 = [&](const unsigned I) { |
1471 | return DAG.getConstant(Val: I, DL: dl, VT: MVT::i32); |
1472 | }; |
1473 | |
1474 | // Variadic arguments. |
1475 | // |
1476 | // Normally, for each argument, we declare a param scalar or a param |
1477 | // byte array in the .param space, and store the argument value to that |
1478 | // param scalar or array starting at offset 0. |
1479 | // |
1480 | // In the case of the first variadic argument, we declare a vararg byte array |
1481 | // with size 0. The exact size of this array isn't known at this point, so |
1482 | // it'll be patched later. All the variadic arguments will be stored to this |
1483 | // array at a certain offset (which gets tracked by 'VAOffset'). The offset is |
1484 | // initially set to 0, so it can be used for non-variadic arguments (which use |
1485 | // 0 offset) to simplify the code. |
1486 | // |
1487 |   // After all variadic arguments are processed, 'VAOffset' holds the size of the |
1488 | // vararg byte array. |
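  //
  // For example (illustrative), calling `int foo(int, ...)` as `foo(1, 2.0, 3)`
  // stores the double at offset 0 and the trailing int at offset 8 of the
  // vararg byte array, whose size is then patched to 12 once all variadic
  // arguments have been processed.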
1489 | |
1490 | SDValue VADeclareParam; // vararg byte array |
1491 | const unsigned FirstVAArg = CLI.NumFixedArgs; // position of first variadic |
1492 | unsigned VAOffset = 0; // current offset in the param array |
1493 | |
1494 | const unsigned UniqueCallSite = GlobalUniqueCallSite++; |
1495 | SDValue TempChain = Chain; |
1496 | Chain = DAG.getCALLSEQ_START(Chain, InSize: UniqueCallSite, OutSize: 0, DL: dl); |
1497 | SDValue InGlue = Chain.getValue(R: 1); |
1498 | |
1499 | // Args.size() and Outs.size() need not match. |
1500 | // Outs.size() will be larger |
1501 | // * if there is an aggregate argument with multiple fields (each field |
1502 | // showing up separately in Outs) |
1503 | // * if there is a vector argument with more than typical vector-length |
1504 | // elements (generally if more than 4) where each vector element is |
1505 | // individually present in Outs. |
1506 | // So a different index should be used for indexing into Outs/OutVals. |
1507 | // See similar issue in LowerFormalArguments. |
1508 | auto AllOuts = ArrayRef(CLI.Outs); |
1509 | auto AllOutVals = ArrayRef(CLI.OutVals); |
1510 | assert(AllOuts.size() == AllOutVals.size() && |
1511 | "Outs and OutVals must be the same size" ); |
1512 |   // Declare the .params or .reg needed to pass values |
1513 | // to the function |
1514 | for (const auto E : llvm::enumerate(First&: Args)) { |
1515 | const auto ArgI = E.index(); |
1516 | const auto Arg = E.value(); |
1517 | const auto ArgOuts = |
1518 | AllOuts.take_while(Pred: [&](auto O) { return O.OrigArgIndex == ArgI; }); |
1519 | const auto ArgOutVals = AllOutVals.take_front(N: ArgOuts.size()); |
1520 | AllOuts = AllOuts.drop_front(N: ArgOuts.size()); |
1521 | AllOutVals = AllOutVals.drop_front(N: ArgOuts.size()); |
1522 | |
1523 | const bool IsVAArg = (ArgI >= FirstVAArg); |
1524 | const bool IsByVal = Arg.IsByVal; |
1525 | |
1526 | const SDValue ParamSymbol = |
1527 | getCallParamSymbol(DAG, I: IsVAArg ? FirstVAArg : ArgI, T: MVT::i32); |
1528 | |
1529 | SmallVector<EVT, 16> VTs; |
1530 | SmallVector<uint64_t, 16> Offsets; |
1531 | |
1532 | assert((!IsByVal || Arg.IndirectType) && |
1533 | "byval arg must have indirect type" ); |
1534 | Type *ETy = (IsByVal ? Arg.IndirectType : Arg.Ty); |
1535 | ComputePTXValueVTs(TLI: *this, DL, Ty: ETy, ValueVTs&: VTs, Offsets: &Offsets, StartingOffset: IsByVal ? 0 : VAOffset); |
1536 | assert(VTs.size() == Offsets.size() && "Size mismatch" ); |
1537 | assert((IsByVal || VTs.size() == ArgOuts.size()) && "Size mismatch" ); |
1538 | |
1539 | const Align ArgAlign = [&]() { |
1540 | if (IsByVal) { |
1541 | // The ByValAlign in the Outs[OIdx].Flags is always set at this point, |
1542 | // so we don't need to worry whether it's naturally aligned or not. |
1543 | // See TargetLowering::LowerCallTo(). |
1544 | const Align InitialAlign = ArgOuts[0].Flags.getNonZeroByValAlign(); |
1545 | const Align ByValAlign = getFunctionByValParamAlign( |
1546 | F: CB->getCalledFunction(), ArgTy: ETy, InitialAlign, DL); |
1547 | if (IsVAArg) |
1548 | VAOffset = alignTo(Size: VAOffset, A: ByValAlign); |
1549 | return ByValAlign; |
1550 | } |
1551 | return getArgumentAlignment(CB, Ty: Arg.Ty, Idx: ArgI + 1, DL); |
1552 | }(); |
1553 | |
1554 | const unsigned TypeSize = DL.getTypeAllocSize(Ty: ETy); |
1555 | assert((!IsByVal || TypeSize == ArgOuts[0].Flags.getByValSize()) && |
1556 | "type size mismatch" ); |
1557 | |
1558 | const std::optional<SDValue> ArgDeclare = [&]() -> std::optional<SDValue> { |
1559 | if (IsVAArg) { |
1560 | if (ArgI == FirstVAArg) { |
1561 | VADeclareParam = DAG.getNode( |
1562 | Opcode: NVPTXISD::DeclareArrayParam, DL: dl, ResultTys: {MVT::Other, MVT::Glue}, |
1563 | Ops: {Chain, ParamSymbol, GetI32(STI.getMaxRequiredAlignment()), |
1564 | GetI32(0), InGlue}); |
1565 | return VADeclareParam; |
1566 | } |
1567 | return std::nullopt; |
1568 | } |
1569 | if (IsByVal || shouldPassAsArray(Ty: Arg.Ty)) { |
1570 | // declare .param .align <align> .b8 .param<n>[<size>]; |
1571 | return DAG.getNode(Opcode: NVPTXISD::DeclareArrayParam, DL: dl, |
1572 | ResultTys: {MVT::Other, MVT::Glue}, |
1573 | Ops: {Chain, ParamSymbol, GetI32(ArgAlign.value()), |
1574 | GetI32(TypeSize), InGlue}); |
1575 | } |
1576 | assert(ArgOuts.size() == 1 && "We must pass only one value as non-array" ); |
1577 | // declare .param .b<size> .param<n>; |
1578 | |
1579 | // PTX ABI requires integral types to be at least 32 bits in |
1580 | // size. FP16 is loaded/stored using i16, so it's handled |
1581 | // here as well. |
1582 | const unsigned PromotedSize = |
1583 | (ArgOuts[0].VT.isInteger() || ArgOuts[0].VT.isFloatingPoint()) |
1584 | ? promoteScalarArgumentSize(size: TypeSize * 8) |
1585 | : TypeSize * 8; |
1586 | |
1587 | return DAG.getNode(Opcode: NVPTXISD::DeclareScalarParam, DL: dl, |
1588 | ResultTys: {MVT::Other, MVT::Glue}, |
1589 | Ops: {Chain, ParamSymbol, GetI32(PromotedSize), InGlue}); |
1590 | }(); |
1591 | if (ArgDeclare) { |
1592 | Chain = ArgDeclare->getValue(R: 0); |
1593 | InGlue = ArgDeclare->getValue(R: 1); |
1594 | } |
1595 | |
1596 | // PTX Interoperability Guide 3.3(A): [Integer] Values shorter |
1597 | // than 32-bits are sign extended or zero extended, depending on |
1598 | // whether they are signed or unsigned types. This case applies |
1599 | // only to scalar parameters and not to aggregate values. |
1600 | const bool ExtendIntegerParam = |
1601 | Arg.Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty: Arg.Ty) < 32; |
1602 | |
1603 | const auto GetStoredValue = [&](const unsigned I, EVT EltVT, |
1604 | const Align PartAlign) { |
1605 | SDValue StVal; |
1606 | if (IsByVal) { |
1607 | SDValue Ptr = ArgOutVals[0]; |
1608 | auto MPI = refinePtrAS(Ptr, DAG, DL, TL: *this); |
1609 | SDValue SrcAddr = |
1610 | DAG.getObjectPtrOffset(SL: dl, Ptr, Offset: TypeSize::getFixed(ExactSize: Offsets[I])); |
1611 | |
1612 | StVal = DAG.getLoad(VT: EltVT, dl, Chain: TempChain, Ptr: SrcAddr, PtrInfo: MPI, Alignment: PartAlign); |
1613 | } else { |
1614 | StVal = ArgOutVals[I]; |
1615 | |
1616 | auto PromotedVT = promoteScalarIntegerPTX(VT: StVal.getValueType()); |
1617 | if (PromotedVT != StVal.getValueType()) { |
1618 | StVal = DAG.getNode(Opcode: getExtOpcode(Flags: ArgOuts[I].Flags), DL: dl, VT: PromotedVT, |
1619 | Operand: StVal); |
1620 | } |
1621 | } |
1622 | |
1623 | if (ExtendIntegerParam) { |
1624 | assert(VTs.size() == 1 && "Scalar can't have multiple parts." ); |
1625 | // zext/sext to i32 |
1626 | StVal = |
1627 | DAG.getNode(Opcode: getExtOpcode(Flags: ArgOuts[I].Flags), DL: dl, VT: MVT::i32, Operand: StVal); |
1628 | } else if (EltVT.getSizeInBits() < 16) { |
1629 | // Use 16-bit registers for small stores as it's the |
1630 | // smallest general purpose register size supported by NVPTX. |
1631 | StVal = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: dl, VT: MVT::i16, Operand: StVal); |
1632 | } |
1633 | return StVal; |
1634 | }; |
1635 | |
1636 | const auto VectorInfo = |
1637 | VectorizePTXValueVTs(ValueVTs: VTs, Offsets, ParamAlignment: ArgAlign, IsVAArg); |
1638 | |
1639 | unsigned J = 0; |
1640 | for (const unsigned NumElts : VectorInfo) { |
1641 | const int CurOffset = Offsets[J]; |
1642 | EVT EltVT = promoteScalarIntegerPTX(VT: VTs[J]); |
1643 | const Align PartAlign = commonAlignment(A: ArgAlign, Offset: CurOffset); |
1644 | |
1645 | // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a |
1646 | // scalar store. In such cases, fall back to byte stores. |
1647 | if (NumElts == 1 && !IsVAArg && PartAlign < DAG.getEVTAlign(MemoryVT: EltVT)) { |
1648 | |
1649 | SDValue StVal = GetStoredValue(J, EltVT, PartAlign); |
1650 | Chain = LowerUnalignedStoreParam(DAG, Chain, |
1651 | Offset: CurOffset + (IsByVal ? VAOffset : 0), |
1652 | ElementType: EltVT, StVal, InGlue, ArgID: ArgI, dl); |
1653 | |
1654 | // LowerUnalignedStoreParam took care of inserting the necessary nodes |
1655 | // into the SDAG, so just move on to the next element. |
1656 | J++; |
1657 | continue; |
1658 | } |
1659 | |
1660 | if (IsVAArg && !IsByVal) |
1661 | // Align each part of the variadic argument to their type. |
1662 | VAOffset = alignTo(Size: VAOffset, A: DAG.getEVTAlign(MemoryVT: EltVT)); |
1663 | |
1664 | assert((IsVAArg || VAOffset == 0) && |
1665 | "VAOffset must be 0 for non-VA args" ); |
1666 | SmallVector<SDValue, 6> StoreOperands{ |
1667 | Chain, GetI32(IsVAArg ? FirstVAArg : ArgI), |
1668 | GetI32(VAOffset + ((IsVAArg && !IsByVal) ? 0 : CurOffset))}; |
1669 | |
1670 | // Record the values to store. |
1671 | for (const unsigned K : llvm::seq(Size: NumElts)) |
1672 | StoreOperands.push_back(Elt: GetStoredValue(J + K, EltVT, PartAlign)); |
1673 | StoreOperands.push_back(Elt: InGlue); |
1674 | |
1675 | NVPTXISD::NodeType Op; |
1676 | switch (NumElts) { |
1677 | case 1: |
1678 | Op = NVPTXISD::StoreParam; |
1679 | break; |
1680 | case 2: |
1681 | Op = NVPTXISD::StoreParamV2; |
1682 | break; |
1683 | case 4: |
1684 | Op = NVPTXISD::StoreParamV4; |
1685 | break; |
1686 | default: |
1687 | llvm_unreachable("Invalid vector info." ); |
1688 | } |
1689 | // Adjust type of the store op if we've extended the scalar |
1690 | // return value. |
1691 | EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT; |
1692 | |
1693 | Chain = DAG.getMemIntrinsicNode( |
1694 | Opcode: Op, dl, VTList: DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue), Ops: StoreOperands, |
1695 | MemVT: TheStoreType, PtrInfo: MachinePointerInfo(), Alignment: PartAlign, |
1696 | Flags: MachineMemOperand::MOStore); |
1697 | InGlue = Chain.getValue(R: 1); |
1698 | |
1699 | // TODO: We may need to support vector types that can be passed |
1700 | // as scalars in variadic arguments. |
1701 | if (IsVAArg && !IsByVal) { |
1702 | assert(NumElts == 1 && |
1703 | "Vectorization is expected to be disabled for variadics." ); |
1704 | VAOffset += |
1705 | DL.getTypeAllocSize(Ty: TheStoreType.getTypeForEVT(Context&: *DAG.getContext())); |
1706 | } |
1707 | |
1708 | J += NumElts; |
1709 | } |
1710 | if (IsVAArg && IsByVal) |
1711 | VAOffset += TypeSize; |
1712 | } |
1713 | |
1714 | GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Val: Callee.getNode()); |
1715 | |
1716 | // Handle Result |
1717 | if (!Ins.empty()) { |
1718 | const SDValue RetDeclare = [&]() { |
1719 | const SDValue RetSymbol = DAG.getExternalSymbol(Sym: "retval0" , VT: MVT::i32); |
1720 | const unsigned ResultSize = DL.getTypeAllocSizeInBits(Ty: RetTy); |
1721 | if (shouldPassAsArray(Ty: RetTy)) { |
1722 | const Align RetAlign = getArgumentAlignment(CB, Ty: RetTy, Idx: 0, DL); |
1723 | return DAG.getNode(Opcode: NVPTXISD::DeclareArrayParam, DL: dl, |
1724 | ResultTys: {MVT::Other, MVT::Glue}, |
1725 | Ops: {Chain, RetSymbol, GetI32(RetAlign.value()), |
1726 | GetI32(ResultSize / 8), InGlue}); |
1727 | } |
1728 | const auto PromotedResultSize = promoteScalarArgumentSize(size: ResultSize); |
1729 | return DAG.getNode( |
1730 | Opcode: NVPTXISD::DeclareScalarParam, DL: dl, ResultTys: {MVT::Other, MVT::Glue}, |
1731 | Ops: {Chain, RetSymbol, GetI32(PromotedResultSize), InGlue}); |
1732 | }(); |
1733 | Chain = RetDeclare.getValue(R: 0); |
1734 | InGlue = RetDeclare.getValue(R: 1); |
1735 | } |
1736 | |
1737 | const bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs); |
1738 | // Set the size of the vararg param byte array if the callee is a variadic |
1739 | // function and the variadic part is not empty. |
1740 | if (HasVAArgs) { |
1741 | SDValue DeclareParamOps[] = {VADeclareParam.getOperand(i: 0), |
1742 | VADeclareParam.getOperand(i: 1), |
1743 | VADeclareParam.getOperand(i: 2), GetI32(VAOffset), |
1744 | VADeclareParam.getOperand(i: 4)}; |
1745 | DAG.MorphNodeTo(N: VADeclareParam.getNode(), Opc: VADeclareParam.getOpcode(), |
1746 | VTs: VADeclareParam->getVTList(), Ops: DeclareParamOps); |
1747 | } |
1748 | |
1749 | // If the type of the callsite does not match that of the function, convert |
1750 | // the callsite to an indirect call. |
1751 | const bool ConvertToIndirectCall = shouldConvertToIndirectCall(CB, Func); |
1752 | |
1753 | // Both indirect calls and libcalls have nullptr Func. In order to distinguish |
1754 | // between them we must rely on the call site value which is valid for |
1755 | // indirect calls but is always null for libcalls. |
1756 | const bool IsIndirectCall = (!Func && CB) || ConvertToIndirectCall; |
1757 | |
1758 | if (isa<ExternalSymbolSDNode>(Val: Callee)) { |
1759 | Function* CalleeFunc = nullptr; |
1760 | |
1761 | // Try to find the callee in the current module. |
1762 | Callee = DAG.getSymbolFunctionGlobalAddress(Op: Callee, TargetFunction: &CalleeFunc); |
1763 | assert(CalleeFunc != nullptr && "Libcall callee must be set." ); |
1764 | |
1765 | // Set the "libcall callee" attribute to indicate that the function |
1766 | // must always have a declaration. |
1767 | CalleeFunc->addFnAttr(Kind: "nvptx-libcall-callee" , Val: "true" ); |
1768 | } |
1769 | |
1770 | if (IsIndirectCall) { |
1771 | // This is indirect function call case : PTX requires a prototype of the |
1772 | // form |
1773 | // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _); |
1774 |   // to be emitted, and the label has to be used as the last arg of the call |
1775 | // instruction. |
1776 | // The prototype is embedded in a string and put as the operand for a |
1777 | // CallPrototype SDNode which will print out to the value of the string. |
1778 | std::string Proto = |
1779 | getPrototype(DL, RetTy, Args, Outs: CLI.Outs, |
1780 | FirstVAArg: HasVAArgs ? std::optional(FirstVAArg) : std::nullopt, CB: *CB, |
1781 | UniqueCallSite); |
1782 | const char *ProtoStr = nvTM->getStrPool().save(S: Proto).data(); |
1783 | Chain = DAG.getNode( |
1784 | Opcode: NVPTXISD::CallPrototype, DL: dl, ResultTys: {MVT::Other, MVT::Glue}, |
1785 | Ops: {Chain, DAG.getTargetExternalSymbol(Sym: ProtoStr, VT: MVT::i32), InGlue}); |
1786 | InGlue = Chain.getValue(R: 1); |
1787 | } |
1788 | |
1789 | if (ConvertToIndirectCall) { |
1790 | // Copy the function ptr to a ptx register and use the register to call the |
1791 | // function. |
1792 | const MVT DestVT = Callee.getValueType().getSimpleVT(); |
1793 | MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); |
1794 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); |
1795 | Register DestReg = MRI.createVirtualRegister(RegClass: TLI.getRegClassFor(VT: DestVT)); |
1796 | auto RegCopy = DAG.getCopyToReg(Chain: DAG.getEntryNode(), dl, Reg: DestReg, N: Callee); |
1797 | Callee = DAG.getCopyFromReg(Chain: RegCopy, dl, Reg: DestReg, VT: DestVT); |
1798 | } |
1799 | |
1800 | const unsigned Proto = IsIndirectCall ? UniqueCallSite : 0; |
1801 | const unsigned NumArgs = |
1802 | std::min<unsigned>(a: CLI.NumFixedArgs + 1, b: Args.size()); |
1803 | /// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns, |
1804 | /// NumParams, Callee, Proto, InGlue) |
1805 | Chain = DAG.getNode(Opcode: NVPTXISD::CALL, DL: dl, ResultTys: {MVT::Other, MVT::Glue}, |
1806 | Ops: {Chain, GetI32(CLI.IsConvergent), GetI32(IsIndirectCall), |
1807 | GetI32(Ins.empty() ? 0 : 1), GetI32(NumArgs), Callee, |
1808 | GetI32(Proto), InGlue}); |
1809 | InGlue = Chain.getValue(R: 1); |
1810 | |
1811 | SmallVector<SDValue, 16> ProxyRegOps; |
1812 | // An item of the vector is filled if the element does not need a ProxyReg |
1813 |   // operation on it and should be added to InVals as is. ProxyRegOps |
1814 |   // contains an empty SDValue at the same index. |
1815 | SmallVector<SDValue, 16> RetElts; |
1816 |   // Temporary ProxyReg operations inserted in `LowerUnalignedLoadRetParam()` |
1817 |   // to use the values of `LoadParam`s; they are replaced later, once |
1818 | // `CALLSEQ_END` is added. |
1819 | SmallVector<SDValue, 16> TempProxyRegOps; |
1820 | |
1821 | // Generate loads from param memory/moves from registers for result |
1822 | if (!Ins.empty()) { |
1823 | SmallVector<EVT, 16> VTs; |
1824 | SmallVector<uint64_t, 16> Offsets; |
1825 | ComputePTXValueVTs(TLI: *this, DL, Ty: RetTy, ValueVTs&: VTs, Offsets: &Offsets, StartingOffset: 0); |
1826 | assert(VTs.size() == Ins.size() && "Bad value decomposition" ); |
1827 | |
1828 | const Align RetAlign = getArgumentAlignment(CB, Ty: RetTy, Idx: 0, DL); |
1829 | |
1830 | // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than |
1831 | // 32-bits are sign extended or zero extended, depending on whether |
1832 | // they are signed or unsigned types. |
1833 | const bool ExtendIntegerRetVal = |
1834 | RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty: RetTy) < 32; |
1835 | |
1836 | const auto VectorInfo = VectorizePTXValueVTs(ValueVTs: VTs, Offsets, ParamAlignment: RetAlign); |
1837 | unsigned I = 0; |
1838 | for (const unsigned VectorizedSize : VectorInfo) { |
1839 | EVT TheLoadType = promoteScalarIntegerPTX(VT: VTs[I]); |
1840 | EVT EltType = Ins[I].VT; |
1841 | const Align EltAlign = commonAlignment(A: RetAlign, Offset: Offsets[I]); |
1842 | |
1843 | if (TheLoadType != VTs[I]) |
1844 | EltType = TheLoadType; |
1845 | |
1846 | if (ExtendIntegerRetVal) { |
1847 | TheLoadType = MVT::i32; |
1848 | EltType = MVT::i32; |
1849 | } else if (TheLoadType.getSizeInBits() < 16) { |
1850 | EltType = MVT::i16; |
1851 | } |
1852 | |
1853 | // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a |
1854 | // scalar load. In such cases, fall back to byte loads. |
1855 | if (VectorizedSize == 1 && RetTy->isAggregateType() && |
1856 | EltAlign < DAG.getEVTAlign(MemoryVT: TheLoadType)) { |
1857 | SDValue Ret = LowerUnalignedLoadRetParam( |
1858 | DAG, Chain, Offset: Offsets[I], ElementType: TheLoadType, InGlue, TempProxyRegOps, dl); |
1859 | ProxyRegOps.push_back(Elt: SDValue()); |
1860 | RetElts.resize(N: I); |
1861 | RetElts.push_back(Elt: Ret); |
1862 | |
1863 | I++; |
1864 | continue; |
1865 | } |
1866 | |
1867 | SmallVector<EVT, 6> LoadVTs(VectorizedSize, EltType); |
1868 | LoadVTs.append(IL: {MVT::Other, MVT::Glue}); |
1869 | |
1870 | NVPTXISD::NodeType Op; |
1871 | switch (VectorizedSize) { |
1872 | case 1: |
1873 | Op = NVPTXISD::LoadParam; |
1874 | break; |
1875 | case 2: |
1876 | Op = NVPTXISD::LoadParamV2; |
1877 | break; |
1878 | case 4: |
1879 | Op = NVPTXISD::LoadParamV4; |
1880 | break; |
1881 | default: |
1882 | llvm_unreachable("Invalid vector info." ); |
1883 | } |
1884 | |
1885 | SDValue LoadOperands[] = {Chain, GetI32(1), GetI32(Offsets[I]), InGlue}; |
1886 | SDValue RetVal = DAG.getMemIntrinsicNode( |
1887 | Opcode: Op, dl, VTList: DAG.getVTList(VTs: LoadVTs), Ops: LoadOperands, MemVT: TheLoadType, |
1888 | PtrInfo: MachinePointerInfo(), Alignment: EltAlign, Flags: MachineMemOperand::MOLoad); |
1889 | |
1890 | for (const unsigned J : llvm::seq(Size: VectorizedSize)) { |
1891 | ProxyRegOps.push_back(Elt: RetVal.getValue(R: J)); |
1892 | } |
1893 | |
1894 | Chain = RetVal.getValue(R: VectorizedSize); |
1895 | InGlue = RetVal.getValue(R: VectorizedSize + 1); |
1896 | |
1897 | I += VectorizedSize; |
1898 | } |
1899 | } |
1900 | |
1901 | Chain = |
1902 | DAG.getCALLSEQ_END(Chain, Size1: UniqueCallSite, Size2: UniqueCallSite + 1, Glue: InGlue, DL: dl); |
1903 | InGlue = Chain.getValue(R: 1); |
1904 | |
1905 | // Append ProxyReg instructions to the chain to make sure that `callseq_end` |
1906 | // will not get lost. Otherwise, during libcalls expansion, the nodes can become |
1907 | // dangling. |
1908 | for (const unsigned I : llvm::seq(Size: ProxyRegOps.size())) { |
1909 | if (I < RetElts.size() && RetElts[I]) { |
1910 | InVals.push_back(Elt: RetElts[I]); |
1911 | continue; |
1912 | } |
1913 | |
1914 | SDValue Ret = |
1915 | DAG.getNode(Opcode: NVPTXISD::ProxyReg, DL: dl, VT: ProxyRegOps[I].getSimpleValueType(), |
1916 | Ops: {Chain, ProxyRegOps[I]}); |
1917 | |
1918 | const EVT ExpectedVT = Ins[I].VT; |
1919 | if (!Ret.getValueType().bitsEq(VT: ExpectedVT)) { |
1920 | Ret = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: ExpectedVT, Operand: Ret); |
1921 | } |
1922 | InVals.push_back(Elt: Ret); |
1923 | } |
1924 | |
1925 | for (SDValue &T : TempProxyRegOps) { |
1926 | SDValue Repl = DAG.getNode(Opcode: NVPTXISD::ProxyReg, DL: dl, VT: T.getSimpleValueType(), |
1927 | Ops: {Chain, T.getOperand(i: 0)}); |
1928 | DAG.ReplaceAllUsesWith(From: T, To: Repl); |
1929 | DAG.RemoveDeadNode(N: T.getNode()); |
1930 | } |
1931 | |
1932 | // set isTailCall to false for now, until we figure out how to express |
1933 | // tail call optimization in PTX |
1934 | isTailCall = false; |
1935 | return Chain; |
1936 | } |
1937 | |
1938 | SDValue NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, |
1939 | SelectionDAG &DAG) const { |
1940 | |
1941 | if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) { |
1942 | const Function &Fn = DAG.getMachineFunction().getFunction(); |
1943 | |
1944 | DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported( |
1945 | Fn, |
1946 | "Support for dynamic alloca introduced in PTX ISA version 7.3 and " |
1947 | "requires target sm_52." , |
1948 | SDLoc(Op).getDebugLoc())); |
1949 | auto Ops = {DAG.getConstant(Val: 0, DL: SDLoc(), VT: Op.getValueType()), |
1950 | Op.getOperand(i: 0)}; |
1951 | return DAG.getMergeValues(Ops, dl: SDLoc()); |
1952 | } |
1953 | |
1954 | SDLoc DL(Op.getNode()); |
1955 | SDValue Chain = Op.getOperand(i: 0); |
1956 | SDValue Size = Op.getOperand(i: 1); |
1957 | uint64_t Align = Op.getConstantOperandVal(i: 2); |
1958 | |
1959 |   // The alignment on an ISD::DYNAMIC_STACKALLOC node may be 0 to indicate that |
1960 | // the default stack alignment should be used. |
1961 | if (Align == 0) |
1962 | Align = DAG.getSubtarget().getFrameLowering()->getStackAlign().value(); |
1963 | |
1964 |   // The size operand of the PTX alloca instruction is 64-bit for m64 and 32-bit for m32. |
1965 | const MVT LocalVT = getPointerTy(DL: DAG.getDataLayout(), AS: ADDRESS_SPACE_LOCAL); |
1966 | |
1967 | SDValue Alloc = |
1968 | DAG.getNode(Opcode: NVPTXISD::DYNAMIC_STACKALLOC, DL, ResultTys: {LocalVT, MVT::Other}, |
1969 | Ops: {Chain, DAG.getZExtOrTrunc(Op: Size, DL, VT: LocalVT), |
1970 | DAG.getTargetConstant(Val: Align, DL, VT: MVT::i32)}); |
1971 | |
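  // The PTX-level alloca yields a pointer in the local address space; cast it
  // back to generic so the result matches the address space expected by the
  // IR-level dynamic alloca.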
1972 | SDValue ASC = DAG.getAddrSpaceCast( |
1973 | dl: DL, VT: Op.getValueType(), Ptr: Alloc, SrcAS: ADDRESS_SPACE_LOCAL, DestAS: ADDRESS_SPACE_GENERIC); |
1974 | |
1975 | return DAG.getMergeValues(Ops: {ASC, SDValue(Alloc.getNode(), 1)}, dl: DL); |
1976 | } |
1977 | |
1978 | SDValue NVPTXTargetLowering::LowerSTACKRESTORE(SDValue Op, |
1979 | SelectionDAG &DAG) const { |
1980 | SDLoc DL(Op.getNode()); |
1981 | if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) { |
1982 | const Function &Fn = DAG.getMachineFunction().getFunction(); |
1983 | |
1984 | DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported( |
1985 | Fn, |
1986 | "Support for stackrestore requires PTX ISA version >= 7.3 and target " |
1987 | ">= sm_52." , |
1988 | DL.getDebugLoc())); |
1989 | return Op.getOperand(i: 0); |
1990 | } |
1991 | |
1992 | const MVT LocalVT = getPointerTy(DL: DAG.getDataLayout(), AS: ADDRESS_SPACE_LOCAL); |
1993 | SDValue Chain = Op.getOperand(i: 0); |
1994 | SDValue Ptr = Op.getOperand(i: 1); |
1995 | SDValue ASC = DAG.getAddrSpaceCast(dl: DL, VT: LocalVT, Ptr, SrcAS: ADDRESS_SPACE_GENERIC, |
1996 | DestAS: ADDRESS_SPACE_LOCAL); |
1997 | return DAG.getNode(Opcode: NVPTXISD::STACKRESTORE, DL, VT: MVT::Other, Ops: {Chain, ASC}); |
1998 | } |
1999 | |
2000 | SDValue NVPTXTargetLowering::LowerSTACKSAVE(SDValue Op, |
2001 | SelectionDAG &DAG) const { |
2002 | SDLoc DL(Op.getNode()); |
2003 | if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) { |
2004 | const Function &Fn = DAG.getMachineFunction().getFunction(); |
2005 | |
2006 | DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported( |
2007 | Fn, |
2008 | "Support for stacksave requires PTX ISA version >= 7.3 and target >= " |
2009 | "sm_52." , |
2010 | DL.getDebugLoc())); |
2011 | auto Ops = {DAG.getConstant(Val: 0, DL, VT: Op.getValueType()), Op.getOperand(i: 0)}; |
2012 | return DAG.getMergeValues(Ops, dl: DL); |
2013 | } |
2014 | |
2015 | const MVT LocalVT = getPointerTy(DL: DAG.getDataLayout(), AS: ADDRESS_SPACE_LOCAL); |
2016 | SDValue Chain = Op.getOperand(i: 0); |
2017 | SDValue SS = |
2018 | DAG.getNode(Opcode: NVPTXISD::STACKSAVE, DL, ResultTys: {LocalVT, MVT::Other}, Ops: Chain); |
2019 | SDValue ASC = DAG.getAddrSpaceCast( |
2020 | dl: DL, VT: Op.getValueType(), Ptr: SS, SrcAS: ADDRESS_SPACE_LOCAL, DestAS: ADDRESS_SPACE_GENERIC); |
2021 | return DAG.getMergeValues(Ops: {ASC, SDValue(SS.getNode(), 1)}, dl: DL); |
2022 | } |
2023 | |
2024 | // By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack() |
2025 | // (see LegalizeDAG.cpp). This is slow and uses local memory. |
2026 | // We use extract/insert/build vector instead, just as LegalizeOp() did in LLVM 2.5. |
2027 | SDValue |
2028 | NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { |
2029 | SDNode *Node = Op.getNode(); |
2030 | SDLoc dl(Node); |
2031 | SmallVector<SDValue, 8> Ops; |
2032 | unsigned NumOperands = Node->getNumOperands(); |
2033 | for (unsigned i = 0; i < NumOperands; ++i) { |
2034 | SDValue SubOp = Node->getOperand(Num: i); |
2035 | EVT VVT = SubOp.getNode()->getValueType(ResNo: 0); |
2036 | EVT EltVT = VVT.getVectorElementType(); |
2037 | unsigned NumSubElem = VVT.getVectorNumElements(); |
2038 | for (unsigned j = 0; j < NumSubElem; ++j) { |
2039 | Ops.push_back(Elt: DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT: EltVT, N1: SubOp, |
2040 | N2: DAG.getIntPtrConstant(Val: j, DL: dl))); |
2041 | } |
2042 | } |
2043 | return DAG.getBuildVector(VT: Node->getValueType(ResNo: 0), DL: dl, Ops); |
2044 | } |
2045 | |
2046 | SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { |
2047 | // Handle bitcasting from v2i8 without hitting the default promotion |
2048 | // strategy which goes through stack memory. |
2049 | EVT FromVT = Op->getOperand(Num: 0)->getValueType(ResNo: 0); |
2050 | if (FromVT != MVT::v2i8) { |
2051 | return Op; |
2052 | } |
2053 | |
2054 | // Pack vector elements into i16 and bitcast to final type |
2055 | SDLoc DL(Op); |
2056 | SDValue Vec0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i8, |
2057 | N1: Op->getOperand(Num: 0), N2: DAG.getIntPtrConstant(Val: 0, DL)); |
2058 | SDValue Vec1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i8, |
2059 | N1: Op->getOperand(Num: 0), N2: DAG.getIntPtrConstant(Val: 1, DL)); |
2060 | SDValue Extend0 = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i16, Operand: Vec0); |
2061 | SDValue Extend1 = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i16, Operand: Vec1); |
2062 | SDValue Const8 = DAG.getConstant(Val: 8, DL, VT: MVT::i16); |
2063 | SDValue AsInt = DAG.getNode( |
2064 | Opcode: ISD::OR, DL, VT: MVT::i16, |
2065 | Ops: {Extend0, DAG.getNode(Opcode: ISD::SHL, DL, VT: MVT::i16, Ops: {Extend1, Const8})}); |
2066 | EVT ToVT = Op->getValueType(ResNo: 0); |
2067 | return DAG.getBitcast(VT: ToVT, V: AsInt); |
2068 | } |
2069 | |
2070 | // We can initialize a constant f16x2/v2i16/v4i8 with a single .b32 move. Normally |
2071 | // it would get lowered as two constant loads and a vector-packing move. |
2072 | // Instead we want just a constant move: |
2073 | // mov.b32 %r2, 0x40003C00 |
2074 | SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op, |
2075 | SelectionDAG &DAG) const { |
2076 | EVT VT = Op->getValueType(ResNo: 0); |
2077 | if (!(Isv2x16VT(VT) || VT == MVT::v4i8)) |
2078 | return Op; |
2079 | SDLoc DL(Op); |
2080 | |
2081 | if (!llvm::all_of(Range: Op->ops(), P: [](SDValue Operand) { |
2082 | return Operand->isUndef() || isa<ConstantSDNode>(Val: Operand) || |
2083 | isa<ConstantFPSDNode>(Val: Operand); |
2084 | })) { |
2085 | if (VT != MVT::v4i8) |
2086 | return Op; |
2087 |     // Lower a non-constant v4i8 vector as a byte-wise constructed i32, which |
2088 |     // allows us to optimize the calculation of its constant parts. |
2089 | auto GetPRMT = [&](const SDValue Left, const SDValue Right, bool Cast, |
2090 | uint64_t SelectionValue) -> SDValue { |
2091 | SDValue L = Left; |
2092 | SDValue R = Right; |
2093 | if (Cast) { |
2094 | L = DAG.getAnyExtOrTrunc(Op: L, DL, VT: MVT::i32); |
2095 | R = DAG.getAnyExtOrTrunc(Op: R, DL, VT: MVT::i32); |
2096 | } |
2097 | return DAG.getNode( |
2098 | Opcode: NVPTXISD::PRMT, DL, VT: MVT::v4i8, |
2099 | Ops: {L, R, DAG.getConstant(Val: SelectionValue, DL, VT: MVT::i32), |
2100 | DAG.getConstant(Val: NVPTX::PTXPrmtMode::NONE, DL, VT: MVT::i32)}); |
2101 | }; |
2102 | auto PRMT__10 = GetPRMT(Op->getOperand(Num: 0), Op->getOperand(Num: 1), true, 0x3340); |
2103 | auto PRMT__32 = GetPRMT(Op->getOperand(Num: 2), Op->getOperand(Num: 3), true, 0x3340); |
2104 | auto PRMT3210 = GetPRMT(PRMT__10, PRMT__32, false, 0x5410); |
2105 | return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: PRMT3210); |
2106 | } |
2107 | |
2108 |   // Get the value of the Nth operand as an APInt(32). Undef is treated as 0. |
2109 | auto GetOperand = [](SDValue Op, int N) -> APInt { |
2110 | const SDValue &Operand = Op->getOperand(Num: N); |
2111 | EVT VT = Op->getValueType(ResNo: 0); |
2112 | if (Operand->isUndef()) |
2113 | return APInt(32, 0); |
2114 | APInt Value; |
2115 | if (VT == MVT::v2f16 || VT == MVT::v2bf16) |
2116 | Value = cast<ConstantFPSDNode>(Val: Operand)->getValueAPF().bitcastToAPInt(); |
2117 | else if (VT == MVT::v2i16 || VT == MVT::v4i8) |
2118 | Value = Operand->getAsAPIntVal(); |
2119 | else |
2120 | llvm_unreachable("Unsupported type" ); |
2121 |     // i8 values are carried around as i16, so zero out the upper bits so they |
2122 |     // do not get in the way when the individual byte values are combined. |
2123 | if (VT == MVT::v4i8) |
2124 | Value = Value.trunc(width: 8); |
2125 | return Value.zext(width: 32); |
2126 | }; |
2127 | APInt Value; |
2128 | if (Isv2x16VT(VT)) { |
2129 | Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(shiftAmt: 16); |
2130 | } else if (VT == MVT::v4i8) { |
2131 | Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(shiftAmt: 8) | |
2132 | GetOperand(Op, 2).shl(shiftAmt: 16) | GetOperand(Op, 3).shl(shiftAmt: 24); |
2133 | } else { |
2134 | llvm_unreachable("Unsupported type" ); |
2135 | } |
2136 | SDValue Const = DAG.getConstant(Val: Value, DL, VT: MVT::i32); |
2137 | return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: Op->getValueType(ResNo: 0), Operand: Const); |
2138 | } |
2139 | |
2140 | SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, |
2141 | SelectionDAG &DAG) const { |
2142 | SDValue Index = Op->getOperand(Num: 1); |
2143 | SDValue Vector = Op->getOperand(Num: 0); |
2144 | SDLoc DL(Op); |
2145 | EVT VectorVT = Vector.getValueType(); |
2146 | |
2147 | if (VectorVT == MVT::v4i8) { |
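    // Extract the requested byte with a bit-field extract of width 8 starting
    // at bit position Index * 8 (e.g. index 2 selects bits [23:16]).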
2148 | SDValue BFE = |
2149 | DAG.getNode(Opcode: NVPTXISD::BFE, DL, VT: MVT::i32, |
2150 | Ops: {Vector, |
2151 | DAG.getNode(Opcode: ISD::MUL, DL, VT: MVT::i32, |
2152 | N1: DAG.getZExtOrTrunc(Op: Index, DL, VT: MVT::i32), |
2153 | N2: DAG.getConstant(Val: 8, DL, VT: MVT::i32)), |
2154 | DAG.getConstant(Val: 8, DL, VT: MVT::i32)}); |
2155 | return DAG.getAnyExtOrTrunc(Op: BFE, DL, VT: Op->getValueType(ResNo: 0)); |
2156 | } |
2157 | |
2158 | // Constant index will be matched by tablegen. |
2159 | if (isa<ConstantSDNode>(Val: Index.getNode())) |
2160 | return Op; |
2161 | |
2162 | // Extract individual elements and select one of them. |
2163 | assert(Isv2x16VT(VectorVT) && "Unexpected vector type." ); |
2164 | EVT EltVT = VectorVT.getVectorElementType(); |
2165 | |
2166 | SDLoc dl(Op.getNode()); |
2167 | SDValue E0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT: EltVT, N1: Vector, |
2168 | N2: DAG.getIntPtrConstant(Val: 0, DL: dl)); |
2169 | SDValue E1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT: EltVT, N1: Vector, |
2170 | N2: DAG.getIntPtrConstant(Val: 1, DL: dl)); |
2171 | return DAG.getSelectCC(DL: dl, LHS: Index, RHS: DAG.getIntPtrConstant(Val: 0, DL: dl), True: E0, False: E1, |
2172 | Cond: ISD::CondCode::SETEQ); |
2173 | } |
2174 | |
2175 | SDValue NVPTXTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, |
2176 | SelectionDAG &DAG) const { |
2177 | SDValue Vector = Op->getOperand(Num: 0); |
2178 | EVT VectorVT = Vector.getValueType(); |
2179 | |
2180 | if (VectorVT != MVT::v4i8) |
2181 | return Op; |
2182 | SDLoc DL(Op); |
2183 | SDValue Value = Op->getOperand(Num: 1); |
2184 | if (Value->isUndef()) |
2185 | return Vector; |
2186 | |
2187 | SDValue Index = Op->getOperand(Num: 2); |
2188 | |
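  // Insert the new byte with a bit-field insert of width 8 at bit position
  // Index * 8 (e.g. index 1 overwrites bits [15:8] of the packed value).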
2189 | SDValue BFI = |
2190 | DAG.getNode(Opcode: NVPTXISD::BFI, DL, VT: MVT::i32, |
2191 | Ops: {DAG.getZExtOrTrunc(Op: Value, DL, VT: MVT::i32), Vector, |
2192 | DAG.getNode(Opcode: ISD::MUL, DL, VT: MVT::i32, |
2193 | N1: DAG.getZExtOrTrunc(Op: Index, DL, VT: MVT::i32), |
2194 | N2: DAG.getConstant(Val: 8, DL, VT: MVT::i32)), |
2195 | DAG.getConstant(Val: 8, DL, VT: MVT::i32)}); |
2196 | return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: Op->getValueType(ResNo: 0), Operand: BFI); |
2197 | } |
2198 | |
2199 | SDValue NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, |
2200 | SelectionDAG &DAG) const { |
2201 | SDValue V1 = Op.getOperand(i: 0); |
2202 | EVT VectorVT = V1.getValueType(); |
2203 | if (VectorVT != MVT::v4i8 || Op.getValueType() != MVT::v4i8) |
2204 | return Op; |
2205 | |
2206 | // Lower shuffle to PRMT instruction. |
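  // Each nibble of the PRMT selector picks one byte of the 8-byte pool formed
  // by {V2, V1} (bytes 0-3 from V1, bytes 4-7 from V2), so the shuffle mask
  // maps directly onto the selector; e.g. mask <0, 4, 1, 5> yields selector
  // 0x5140 (illustrative).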
2207 | const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Val: Op.getNode()); |
2208 | SDValue V2 = Op.getOperand(i: 1); |
2209 | uint32_t Selector = 0; |
2210 | for (auto I : llvm::enumerate(First: SVN->getMask())) { |
2211 | if (I.value() != -1) // -1 is a placeholder for undef. |
2212 | Selector |= (I.value() << (I.index() * 4)); |
2213 | } |
2214 | |
2215 | SDLoc DL(Op); |
2216 | return DAG.getNode(Opcode: NVPTXISD::PRMT, DL, VT: MVT::v4i8, N1: V1, N2: V2, |
2217 | N3: DAG.getConstant(Val: Selector, DL, VT: MVT::i32), |
2218 | N4: DAG.getConstant(Val: NVPTX::PTXPrmtMode::NONE, DL, VT: MVT::i32)); |
2219 | } |
2220 | /// LowerShiftRightParts - Lower SRL_PARTS and SRA_PARTS, which |
2221 | /// 1) return two i32 values and take a 2 x i32 value to shift plus a shift |
2222 | /// amount, or |
2223 | /// 2) return two i64 values and take a 2 x i64 value to shift plus a shift |
2224 | /// amount. |
2225 | SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op, |
2226 | SelectionDAG &DAG) const { |
2227 | assert(Op.getNumOperands() == 3 && "Not a double-shift!" ); |
2228 | assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); |
2229 | |
2230 | EVT VT = Op.getValueType(); |
2231 | unsigned VTBits = VT.getSizeInBits(); |
2232 | SDLoc dl(Op); |
2233 | SDValue ShOpLo = Op.getOperand(i: 0); |
2234 | SDValue ShOpHi = Op.getOperand(i: 1); |
2235 | SDValue ShAmt = Op.getOperand(i: 2); |
2236 | unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; |
2237 | |
2238 | if (VTBits == 32 && STI.getSmVersion() >= 35) { |
2239 |     // For 32-bit and sm_35+, we can use the funnel shift 'shf' instruction. |
2240 | // {dHi, dLo} = {aHi, aLo} >> Amt |
2241 | // dHi = aHi >> Amt |
2242 | // dLo = shf.r.clamp aLo, aHi, Amt |
2243 | |
2244 | SDValue Hi = DAG.getNode(Opcode: Opc, DL: dl, VT, N1: ShOpHi, N2: ShAmt); |
2245 | SDValue Lo = |
2246 | DAG.getNode(Opcode: NVPTXISD::FSHR_CLAMP, DL: dl, VT, N1: ShOpHi, N2: ShOpLo, N3: ShAmt); |
2247 | |
2248 | SDValue Ops[2] = { Lo, Hi }; |
2249 | return DAG.getMergeValues(Ops, dl); |
2250 | } |
2251 | else { |
2252 | // {dHi, dLo} = {aHi, aLo} >> Amt |
2253 | // - if (Amt>=size) then |
2254 | // dLo = aHi >> (Amt-size) |
2255 | // dHi = aHi >> Amt (this is either all 0 or all 1) |
2256 | // else |
2257 | // dLo = (aLo >>logic Amt) | (aHi << (size-Amt)) |
2258 | // dHi = aHi >> Amt |
2259 | |
2260 | SDValue RevShAmt = DAG.getNode(Opcode: ISD::SUB, DL: dl, VT: MVT::i32, |
2261 | N1: DAG.getConstant(Val: VTBits, DL: dl, VT: MVT::i32), |
2262 | N2: ShAmt); |
2263 | SDValue Tmp1 = DAG.getNode(Opcode: ISD::SRL, DL: dl, VT, N1: ShOpLo, N2: ShAmt); |
2264 |     SDValue ExtraShAmt = DAG.getNode(Opcode: ISD::SUB, DL: dl, VT: MVT::i32, N1: ShAmt, |
2265 | N2: DAG.getConstant(Val: VTBits, DL: dl, VT: MVT::i32)); |
2266 | SDValue Tmp2 = DAG.getNode(Opcode: ISD::SHL, DL: dl, VT, N1: ShOpHi, N2: RevShAmt); |
2267 | SDValue FalseVal = DAG.getNode(Opcode: ISD::OR, DL: dl, VT, N1: Tmp1, N2: Tmp2); |
2268 | SDValue TrueVal = DAG.getNode(Opcode: Opc, DL: dl, VT, N1: ShOpHi, N2: ExtraShAmt); |
2269 | |
2270 | SDValue Cmp = DAG.getSetCC(DL: dl, VT: MVT::i1, LHS: ShAmt, |
2271 | RHS: DAG.getConstant(Val: VTBits, DL: dl, VT: MVT::i32), |
2272 | Cond: ISD::SETGE); |
2273 | SDValue Hi = DAG.getNode(Opcode: Opc, DL: dl, VT, N1: ShOpHi, N2: ShAmt); |
2274 | SDValue Lo = DAG.getNode(Opcode: ISD::SELECT, DL: dl, VT, N1: Cmp, N2: TrueVal, N3: FalseVal); |
2275 | |
2276 | SDValue Ops[2] = { Lo, Hi }; |
2277 | return DAG.getMergeValues(Ops, dl); |
2278 | } |
2279 | } |
2280 | |
2281 | /// LowerShiftLeftParts - Lower SHL_PARTS, which |
2282 | /// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift |
2283 | /// amount, or |
2284 | /// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift |
2285 | /// amount. |
2286 | SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op, |
2287 | SelectionDAG &DAG) const { |
2288 | assert(Op.getNumOperands() == 3 && "Not a double-shift!" ); |
2289 | assert(Op.getOpcode() == ISD::SHL_PARTS); |
2290 | |
2291 | EVT VT = Op.getValueType(); |
2292 | unsigned VTBits = VT.getSizeInBits(); |
2293 | SDLoc dl(Op); |
2294 | SDValue ShOpLo = Op.getOperand(i: 0); |
2295 | SDValue ShOpHi = Op.getOperand(i: 1); |
2296 | SDValue ShAmt = Op.getOperand(i: 2); |
2297 | |
2298 | if (VTBits == 32 && STI.getSmVersion() >= 35) { |
2299 |     // For 32-bit and sm_35+, we can use the funnel shift 'shf' instruction. |
2300 | // {dHi, dLo} = {aHi, aLo} << Amt |
2301 | // dHi = shf.l.clamp aLo, aHi, Amt |
2302 | // dLo = aLo << Amt |
2303 | |
2304 | SDValue Hi = |
2305 | DAG.getNode(Opcode: NVPTXISD::FSHL_CLAMP, DL: dl, VT, N1: ShOpHi, N2: ShOpLo, N3: ShAmt); |
2306 | SDValue Lo = DAG.getNode(Opcode: ISD::SHL, DL: dl, VT, N1: ShOpLo, N2: ShAmt); |
2307 | |
2308 | SDValue Ops[2] = { Lo, Hi }; |
2309 | return DAG.getMergeValues(Ops, dl); |
2310 | } |
2311 | else { |
2312 | // {dHi, dLo} = {aHi, aLo} << Amt |
2313 | // - if (Amt>=size) then |
2314 | // dLo = aLo << Amt (all 0) |
2315 | // dLo = aLo << (Amt-size) |
2316 | // else |
2317 | // dLo = aLo << Amt |
2318 | // dHi = (aHi << Amt) | (aLo >> (size-Amt)) |
2319 | |
2320 | SDValue RevShAmt = DAG.getNode(Opcode: ISD::SUB, DL: dl, VT: MVT::i32, |
2321 | N1: DAG.getConstant(Val: VTBits, DL: dl, VT: MVT::i32), |
2322 | N2: ShAmt); |
2323 | SDValue Tmp1 = DAG.getNode(Opcode: ISD::SHL, DL: dl, VT, N1: ShOpHi, N2: ShAmt); |
2324 |     SDValue ExtraShAmt = DAG.getNode(Opcode: ISD::SUB, DL: dl, VT: MVT::i32, N1: ShAmt, |
2325 | N2: DAG.getConstant(Val: VTBits, DL: dl, VT: MVT::i32)); |
2326 | SDValue Tmp2 = DAG.getNode(Opcode: ISD::SRL, DL: dl, VT, N1: ShOpLo, N2: RevShAmt); |
2327 | SDValue FalseVal = DAG.getNode(Opcode: ISD::OR, DL: dl, VT, N1: Tmp1, N2: Tmp2); |
2328 | SDValue TrueVal = DAG.getNode(Opcode: ISD::SHL, DL: dl, VT, N1: ShOpLo, N2: ExtraShAmt); |
2329 | |
2330 | SDValue Cmp = DAG.getSetCC(DL: dl, VT: MVT::i1, LHS: ShAmt, |
2331 | RHS: DAG.getConstant(Val: VTBits, DL: dl, VT: MVT::i32), |
2332 | Cond: ISD::SETGE); |
2333 | SDValue Lo = DAG.getNode(Opcode: ISD::SHL, DL: dl, VT, N1: ShOpLo, N2: ShAmt); |
2334 | SDValue Hi = DAG.getNode(Opcode: ISD::SELECT, DL: dl, VT, N1: Cmp, N2: TrueVal, N3: FalseVal); |
2335 | |
2336 | SDValue Ops[2] = { Lo, Hi }; |
2337 | return DAG.getMergeValues(Ops, dl); |
2338 | } |
2339 | } |
2340 | |
2341 | /// If the types match, convert the generic copysign to the NVPTXISD version, |
2342 | /// otherwise bail, ensuring that mismatched cases are properly expanded. |
2343 | SDValue NVPTXTargetLowering::LowerFCOPYSIGN(SDValue Op, |
2344 | SelectionDAG &DAG) const { |
2345 | EVT VT = Op.getValueType(); |
2346 | SDLoc DL(Op); |
2347 | |
2348 | SDValue In1 = Op.getOperand(i: 0); |
2349 | SDValue In2 = Op.getOperand(i: 1); |
2350 | EVT SrcVT = In2.getValueType(); |
2351 | |
2352 | if (!SrcVT.bitsEq(VT)) |
2353 | return SDValue(); |
2354 | |
2355 | return DAG.getNode(Opcode: NVPTXISD::FCOPYSIGN, DL, VT, N1: In1, N2: In2); |
2356 | } |
2357 | |
2358 | SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const { |
2359 | EVT VT = Op.getValueType(); |
2360 | |
2361 | if (VT == MVT::f32) |
2362 | return LowerFROUND32(Op, DAG); |
2363 | |
2364 | if (VT == MVT::f64) |
2365 | return LowerFROUND64(Op, DAG); |
2366 | |
2367 | llvm_unreachable("unhandled type" ); |
2368 | } |
2369 | |
2370 | // This is the rounding method used in CUDA libdevice, in C-like code: |
2371 | // float roundf(float A) |
2372 | // { |
2373 | // float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f)); |
2374 | // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA; |
2375 | // return abs(A) < 0.5 ? (float)(int)A : RoundedA; |
2376 | // } |
2377 | SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op, |
2378 | SelectionDAG &DAG) const { |
2379 | SDLoc SL(Op); |
2380 | SDValue A = Op.getOperand(i: 0); |
2381 | EVT VT = Op.getValueType(); |
2382 | |
2383 | SDValue AbsA = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT, Operand: A); |
2384 | |
2385 | // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f)) |
2386 | SDValue Bitcast = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: A); |
2387 | const unsigned SignBitMask = 0x80000000; |
2388 | SDValue Sign = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32, N1: Bitcast, |
2389 | N2: DAG.getConstant(Val: SignBitMask, DL: SL, VT: MVT::i32)); |
2390 | const unsigned PointFiveInBits = 0x3F000000; |
2391 | SDValue PointFiveWithSignRaw = |
2392 | DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: Sign, |
2393 | N2: DAG.getConstant(Val: PointFiveInBits, DL: SL, VT: MVT::i32)); |
2394 | SDValue PointFiveWithSign = |
2395 | DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: PointFiveWithSignRaw); |
2396 | SDValue AdjustedA = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: A, N2: PointFiveWithSign); |
2397 | SDValue RoundedA = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT, Operand: AdjustedA); |
2398 | |
2399 | // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA; |
2400 | EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Ctx&: *DAG.getContext(), VT); |
2401 | SDValue IsLarge = |
2402 | DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: AbsA, RHS: DAG.getConstantFP(Val: pow(x: 2.0, y: 23.0), DL: SL, VT), |
2403 | Cond: ISD::SETOGT); |
2404 | RoundedA = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: IsLarge, N2: A, N3: RoundedA); |
2405 | |
2406 | // return abs(A) < 0.5 ? (float)(int)A : RoundedA; |
2407 |   SDValue IsSmall = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: AbsA, |
2408 | RHS: DAG.getConstantFP(Val: 0.5, DL: SL, VT), Cond: ISD::SETOLT); |
2409 | SDValue RoundedAForSmallA = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT, Operand: A); |
2410 | return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: IsSmall, N2: RoundedAForSmallA, N3: RoundedA); |
2411 | } |
2412 | |
2413 | // The implementation of round(double) is similar to that of round(float) in |
2414 | // that they both separate the value range into three regions and use a method |
2415 | // specific to the region to round the values. However, round(double) first |
2416 | // rounds the absolute value and then adds the sign back, while round(float) |
2417 | // rounds the value with its sign directly. |
2418 | SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op, |
2419 | SelectionDAG &DAG) const { |
2420 | SDLoc SL(Op); |
2421 | SDValue A = Op.getOperand(i: 0); |
2422 | EVT VT = Op.getValueType(); |
2423 | |
2424 | SDValue AbsA = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT, Operand: A); |
2425 | |
2426 | // double RoundedA = (double) (int) (abs(A) + 0.5f); |
2427 | SDValue AdjustedA = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: AbsA, |
2428 | N2: DAG.getConstantFP(Val: 0.5, DL: SL, VT)); |
2429 | SDValue RoundedA = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT, Operand: AdjustedA); |
2430 | |
2431 | // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA; |
2432 | EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Ctx&: *DAG.getContext(), VT); |
2433 |   SDValue IsSmall = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: AbsA, |
2434 | RHS: DAG.getConstantFP(Val: 0.5, DL: SL, VT), Cond: ISD::SETOLT); |
2435 | RoundedA = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: IsSmall, |
2436 | N2: DAG.getConstantFP(Val: 0, DL: SL, VT), |
2437 | N3: RoundedA); |
2438 | |
2439 | // Add sign to rounded_A |
2440 | RoundedA = DAG.getNode(Opcode: ISD::FCOPYSIGN, DL: SL, VT, N1: RoundedA, N2: A); |
2441 | DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT, Operand: A); |
2442 | |
2443 | // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA; |
2444 | SDValue IsLarge = |
2445 | DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: AbsA, RHS: DAG.getConstantFP(Val: pow(x: 2.0, y: 52.0), DL: SL, VT), |
2446 | Cond: ISD::SETOGT); |
2447 | return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: IsLarge, N2: A, N3: RoundedA); |
2448 | } |
2449 | |
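// Promote an f16/bf16 binary operation (scalar or vector) to f32, perform the
// operation in f32, and round the result back to the original type.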
2450 | static SDValue PromoteBinOpToF32(SDNode *N, SelectionDAG &DAG) { |
2451 | EVT VT = N->getValueType(ResNo: 0); |
2452 | EVT NVT = MVT::f32; |
2453 | if (VT.isVector()) { |
2454 | NVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: NVT, EC: VT.getVectorElementCount()); |
2455 | } |
2456 | SDLoc DL(N); |
2457 | SDValue Tmp0 = DAG.getFPExtendOrRound(Op: N->getOperand(Num: 0), DL, VT: NVT); |
2458 | SDValue Tmp1 = DAG.getFPExtendOrRound(Op: N->getOperand(Num: 1), DL, VT: NVT); |
2459 | SDValue Res = DAG.getNode(Opcode: N->getOpcode(), DL, VT: NVT, N1: Tmp0, N2: Tmp1, Flags: N->getFlags()); |
2460 | return DAG.getFPExtendOrRound(Op: Res, DL, VT); |
2461 | } |
2462 | |
2463 | SDValue NVPTXTargetLowering::PromoteBinOpIfF32FTZ(SDValue Op, |
2464 | SelectionDAG &DAG) const { |
2465 | if (useF32FTZ(MF: DAG.getMachineFunction())) { |
2466 | return PromoteBinOpToF32(N: Op.getNode(), DAG); |
2467 | } |
2468 | return Op; |
2469 | } |
2470 | |
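// This lowering is only reached when the direct integer-to-bf16 conversion is
// unavailable (see the assert below): convert to f32 first, then round the
// f32 result down to bf16.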
2471 | SDValue NVPTXTargetLowering::LowerINT_TO_FP(SDValue Op, |
2472 | SelectionDAG &DAG) const { |
2473 | assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78); |
2474 | |
2475 | if (Op.getValueType() == MVT::bf16) { |
2476 | SDLoc Loc(Op); |
2477 | return DAG.getNode( |
2478 | Opcode: ISD::FP_ROUND, DL: Loc, VT: MVT::bf16, |
2479 | N1: DAG.getNode(Opcode: Op.getOpcode(), DL: Loc, VT: MVT::f32, Operand: Op.getOperand(i: 0)), |
2480 | N2: DAG.getIntPtrConstant(Val: 0, DL: Loc, /*isTarget=*/true)); |
2481 | } |
2482 | |
2483 | // Everything else is considered legal. |
2484 | return Op; |
2485 | } |
2486 | |
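// This lowering is only reached when the direct bf16-to-integer conversion is
// unavailable (see the assert below): extend the bf16 source to f32 and
// convert from there.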
2487 | SDValue NVPTXTargetLowering::LowerFP_TO_INT(SDValue Op, |
2488 | SelectionDAG &DAG) const { |
2489 | assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78); |
2490 | |
2491 | if (Op.getOperand(i: 0).getValueType() == MVT::bf16) { |
2492 | SDLoc Loc(Op); |
2493 | return DAG.getNode( |
2494 | Opcode: Op.getOpcode(), DL: Loc, VT: Op.getValueType(), |
2495 | Operand: DAG.getNode(Opcode: ISD::FP_EXTEND, DL: Loc, VT: MVT::f32, Operand: Op.getOperand(i: 0))); |
2496 | } |
2497 | |
2498 | // Everything else is considered legal. |
2499 | return Op; |
2500 | } |
2501 | |
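// Custom lowering of FP_ROUND to bf16. Depending on the SM/PTX version this is
// expanded entirely in software, uses the native f32 -> bf16 conversion (after
// a round-inexact-to-odd f64 -> f32 step for f64 sources), or is left as-is.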
2502 | SDValue NVPTXTargetLowering::LowerFP_ROUND(SDValue Op, |
2503 | SelectionDAG &DAG) const { |
2504 | EVT NarrowVT = Op.getValueType(); |
2505 | SDValue Wide = Op.getOperand(i: 0); |
2506 | EVT WideVT = Wide.getValueType(); |
2507 | if (NarrowVT.getScalarType() == MVT::bf16) { |
2508 | const TargetLowering *TLI = STI.getTargetLowering(); |
2509 | if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 70) { |
2510 | return TLI->expandFP_ROUND(Node: Op.getNode(), DAG); |
2511 | } |
2512 | if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) { |
2513 | // This combination was the first to support f32 -> bf16. |
2514 | if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70) { |
2515 | if (WideVT.getScalarType() == MVT::f32) { |
2516 | return Op; |
2517 | } |
2518 | if (WideVT.getScalarType() == MVT::f64) { |
2519 | SDLoc Loc(Op); |
2520 | // Round-inexact-to-odd f64 to f32, then do the final rounding using |
2521 | // the hardware f32 -> bf16 instruction. |
2522 | SDValue rod = TLI->expandRoundInexactToOdd( |
2523 | ResultVT: WideVT.isVector() ? WideVT.changeVectorElementType(EltVT: MVT::f32) |
2524 | : MVT::f32, |
2525 | Op: Wide, DL: Loc, DAG); |
2526 | return DAG.getFPExtendOrRound(Op: rod, DL: Loc, VT: NarrowVT); |
2527 | } |
2528 | } |
2529 | return TLI->expandFP_ROUND(Node: Op.getNode(), DAG); |
2530 | } |
2531 | } |
2532 | |
2533 | // Everything else is considered legal. |
2534 | return Op; |
2535 | } |
2536 | |
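// Custom lowering of FP_EXTEND from bf16. When the native conversion is not
// available for the current SM/PTX version, expand via BF16_TO_FP and, for f64
// results, an intermediate f32 extension.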
2537 | SDValue NVPTXTargetLowering::LowerFP_EXTEND(SDValue Op, |
2538 | SelectionDAG &DAG) const { |
2539 | SDValue Narrow = Op.getOperand(i: 0); |
2540 | EVT NarrowVT = Narrow.getValueType(); |
2541 | EVT WideVT = Op.getValueType(); |
2542 | if (NarrowVT.getScalarType() == MVT::bf16) { |
2543 | if (WideVT.getScalarType() == MVT::f32 && |
2544 | (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71)) { |
2545 | SDLoc Loc(Op); |
2546 | return DAG.getNode(Opcode: ISD::BF16_TO_FP, DL: Loc, VT: WideVT, Operand: Narrow); |
2547 | } |
2548 | if (WideVT.getScalarType() == MVT::f64 && |
2549 | (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78)) { |
2550 | EVT F32 = NarrowVT.isVector() ? NarrowVT.changeVectorElementType(EltVT: MVT::f32) |
2551 | : MVT::f32; |
2552 | SDLoc Loc(Op); |
2553 | if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 71) { |
2554 | Op = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: Loc, VT: F32, Operand: Narrow); |
2555 | } else { |
2556 | Op = DAG.getNode(Opcode: ISD::BF16_TO_FP, DL: Loc, VT: F32, Operand: Narrow); |
2557 | } |
2558 | return DAG.getNode(Opcode: ISD::FP_EXTEND, DL: Loc, VT: WideVT, Operand: Op); |
2559 | } |
2560 | } |
2561 | |
2562 | // Everything else is considered legal. |
2563 | return Op; |
2564 | } |
2565 | |
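// Scalarize a v2i16 operation that has no direct lowering: extract the
// elements, apply the operation per element, and rebuild the vector. All other
// types are returned unchanged.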
2566 | static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG) { |
2567 | SDLoc DL(Op); |
2568 | if (Op.getValueType() != MVT::v2i16) |
2569 | return Op; |
2570 | EVT EltVT = Op.getValueType().getVectorElementType(); |
2571 | SmallVector<SDValue> VecElements; |
2572 | for (int I = 0, E = Op.getValueType().getVectorNumElements(); I < E; I++) { |
2573 | SmallVector<SDValue> ScalarArgs; |
2574 | llvm::transform(Range: Op->ops(), d_first: std::back_inserter(x&: ScalarArgs), |
2575 | F: [&](const SDUse &O) { |
2576 | return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: EltVT, |
2577 | N1: O.get(), N2: DAG.getIntPtrConstant(Val: I, DL)); |
2578 | }); |
2579 | VecElements.push_back(Elt: DAG.getNode(Opcode: Op.getOpcode(), DL, VT: EltVT, Ops: ScalarArgs)); |
2580 | } |
2581 | SDValue V = |
2582 | DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL, VT: Op.getValueType(), Ops: VecElements); |
2583 | return V; |
2584 | } |
2585 | |
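// Flatten any vector operands of a tcgen05.st intrinsic into individual scalar
// elements and rebuild the memory intrinsic node with the flattened operand
// list.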
2586 | static SDValue LowerTcgen05St(SDValue Op, SelectionDAG &DAG) { |
2587 | SDNode *N = Op.getNode(); |
2588 | SDLoc DL(N); |
2589 | SmallVector<SDValue, 32> Ops; |
2590 | |
// Split any vector operands into their scalar elements.
2592 | for (size_t I = 0; I < N->getNumOperands(); I++) { |
2593 | SDValue Val = N->getOperand(Num: I); |
2594 | EVT ValVT = Val.getValueType(); |
2595 | if (ValVT.isVector()) { |
2596 | EVT EltVT = ValVT.getVectorElementType(); |
2597 | for (unsigned J = 0, NElts = ValVT.getVectorNumElements(); J < NElts; J++) |
2598 | Ops.push_back(Elt: DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: EltVT, N1: Val, |
2599 | N2: DAG.getIntPtrConstant(Val: J, DL))); |
2600 | } else |
2601 | Ops.push_back(Elt: Val); |
2602 | } |
2603 | |
2604 | MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(Val: N); |
2605 | SDValue Tcgen05StNode = |
2606 | DAG.getMemIntrinsicNode(Opcode: ISD::INTRINSIC_VOID, dl: DL, VTList: N->getVTList(), Ops, |
2607 | MemVT: MemSD->getMemoryVT(), MMO: MemSD->getMemOperand()); |
2608 | |
2609 | return Tcgen05StNode; |
2610 | } |
2611 | |
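// Custom lowering for INTRINSIC_VOID nodes: tcgen05.st intrinsics get their
// vector operands flattened; all other intrinsics are left untouched.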
2612 | static SDValue LowerIntrinsicVoid(SDValue Op, SelectionDAG &DAG) { |
2613 | SDNode *N = Op.getNode(); |
2614 | SDValue Intrin = N->getOperand(Num: 1); |
2615 | |
2616 | // Get the intrinsic ID |
2617 | unsigned IntrinNo = cast<ConstantSDNode>(Val: Intrin.getNode())->getZExtValue(); |
2618 | switch (IntrinNo) { |
2619 | default: |
2620 | break; |
2621 | case Intrinsic::nvvm_tcgen05_st_16x64b_x1: |
2622 | case Intrinsic::nvvm_tcgen05_st_16x64b_x2: |
2623 | case Intrinsic::nvvm_tcgen05_st_16x64b_x4: |
2624 | case Intrinsic::nvvm_tcgen05_st_16x64b_x8: |
2625 | case Intrinsic::nvvm_tcgen05_st_16x64b_x16: |
2626 | case Intrinsic::nvvm_tcgen05_st_16x64b_x32: |
2627 | case Intrinsic::nvvm_tcgen05_st_16x64b_x128: |
2628 | case Intrinsic::nvvm_tcgen05_st_16x128b_x1: |
2629 | case Intrinsic::nvvm_tcgen05_st_16x128b_x2: |
2630 | case Intrinsic::nvvm_tcgen05_st_16x128b_x4: |
2631 | case Intrinsic::nvvm_tcgen05_st_16x128b_x8: |
2632 | case Intrinsic::nvvm_tcgen05_st_16x128b_x16: |
2633 | case Intrinsic::nvvm_tcgen05_st_16x128b_x32: |
2634 | case Intrinsic::nvvm_tcgen05_st_16x128b_x64: |
2635 | case Intrinsic::nvvm_tcgen05_st_16x256b_x1: |
2636 | case Intrinsic::nvvm_tcgen05_st_16x256b_x2: |
2637 | case Intrinsic::nvvm_tcgen05_st_16x256b_x4: |
2638 | case Intrinsic::nvvm_tcgen05_st_16x256b_x8: |
2639 | case Intrinsic::nvvm_tcgen05_st_16x256b_x16: |
2640 | case Intrinsic::nvvm_tcgen05_st_16x256b_x32: |
2641 | case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1: |
2642 | case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2: |
2643 | case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4: |
2644 | case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8: |
2645 | case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16: |
2646 | case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32: |
2647 | case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64: |
2648 | case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128: |
2649 | case Intrinsic::nvvm_tcgen05_st_32x32b_x1: |
2650 | case Intrinsic::nvvm_tcgen05_st_32x32b_x2: |
2651 | case Intrinsic::nvvm_tcgen05_st_32x32b_x4: |
2652 | case Intrinsic::nvvm_tcgen05_st_32x32b_x8: |
2653 | case Intrinsic::nvvm_tcgen05_st_32x32b_x16: |
2654 | case Intrinsic::nvvm_tcgen05_st_32x32b_x32: |
2655 | case Intrinsic::nvvm_tcgen05_st_16x64b_x64: |
2656 | case Intrinsic::nvvm_tcgen05_st_32x32b_x64: |
2657 | case Intrinsic::nvvm_tcgen05_st_32x32b_x128: |
2658 | return LowerTcgen05St(Op, DAG); |
2659 | } |
2660 | return Op; |
2661 | } |
2662 | |
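// Lower a clusterlaunchcontrol.query_cancel intrinsic by bitcasting its i128
// try-cancel-response operand to v2i64, extracting the two halves, and
// selecting the corresponding NVPTXISD node.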
2663 | static SDValue LowerClusterLaunchControlQueryCancel(SDValue Op, |
2664 | SelectionDAG &DAG) { |
2665 | |
2666 | SDNode *N = Op.getNode(); |
2667 | if (N->getOperand(Num: 1).getValueType() != MVT::i128) { |
// Return if the operand has already been lowered.
2669 | return SDValue(); |
2670 | } |
2671 | |
2672 | unsigned IID = |
2673 | cast<ConstantSDNode>(Val: N->getOperand(Num: 0).getNode())->getZExtValue(); |
2674 | auto Opcode = [&]() { |
2675 | switch (IID) { |
2676 | case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_is_canceled: |
2677 | return NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_IS_CANCELED; |
2678 | case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_x: |
2679 | return NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_X; |
2680 | case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_y: |
2681 | return NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Y; |
2682 | case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_z: |
2683 | return NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Z; |
2684 | default: |
2685 | llvm_unreachable("unsupported/unhandled intrinsic" ); |
2686 | } |
2687 | }(); |
2688 | |
2689 | SDLoc DL(N); |
2690 | SDValue TryCancelResponse = N->getOperand(Num: 1); |
2691 | SDValue Cast = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v2i64, Operand: TryCancelResponse); |
2692 | SDValue TryCancelResponse0 = |
2693 | DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i64, N1: Cast, |
2694 | N2: DAG.getIntPtrConstant(Val: 0, DL)); |
2695 | SDValue TryCancelResponse1 = |
2696 | DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i64, N1: Cast, |
2697 | N2: DAG.getIntPtrConstant(Val: 1, DL)); |
2698 | |
2699 | return DAG.getNode(Opcode, DL, VTList: N->getVTList(), |
2700 | Ops: {TryCancelResponse0, TryCancelResponse1}); |
2701 | } |
2702 | |
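// Custom lowering for INTRINSIC_WO_CHAIN nodes.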
2703 | static SDValue lowerIntrinsicWOChain(SDValue Op, SelectionDAG &DAG) { |
2704 | switch (Op->getConstantOperandVal(Num: 0)) { |
2705 | default: |
2706 | return Op; |
2707 | case Intrinsic::nvvm_internal_addrspace_wrap: |
2708 | return Op.getOperand(i: 1); |
2709 | case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_is_canceled: |
2710 | case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_x: |
2711 | case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_y: |
2712 | case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_z: |
2713 | return LowerClusterLaunchControlQueryCancel(Op, DAG); |
2714 | } |
2715 | } |
2716 | |
2717 | // In PTX 64-bit CTLZ and CTPOP are supported, but they return a 32-bit value. |
2718 | // Lower these into a node returning the correct type which is zero-extended |
2719 | // back to the correct size. |
2720 | static SDValue lowerCTLZCTPOP(SDValue Op, SelectionDAG &DAG) { |
2721 | SDValue V = Op->getOperand(Num: 0); |
2722 | assert(V.getValueType() == MVT::i64 && |
2723 | "Unexpected CTLZ/CTPOP type to legalize" ); |
2724 | |
2725 | SDLoc DL(Op); |
2726 | SDValue CT = DAG.getNode(Opcode: Op->getOpcode(), DL, VT: MVT::i32, Operand: V); |
2727 | return DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i64, Operand: CT, Flags: SDNodeFlags::NonNeg); |
2728 | } |
2729 | |
2730 | static SDValue expandFSH64(SDValue A, SDValue B, SDValue ShiftAmount, SDLoc DL, |
2731 | unsigned Opcode, SelectionDAG &DAG) { |
2732 | assert(A.getValueType() == MVT::i64 && B.getValueType() == MVT::i64); |
2733 | |
2734 | const auto *AmtConst = dyn_cast<ConstantSDNode>(Val&: ShiftAmount); |
2735 | if (!AmtConst) |
2736 | return SDValue(); |
2737 | const auto Amt = AmtConst->getZExtValue() & 63; |
2738 | |
2739 | SDValue UnpackA = |
2740 | DAG.getNode(Opcode: NVPTXISD::UNPACK_VECTOR, DL, ResultTys: {MVT::i32, MVT::i32}, Ops: A); |
2741 | SDValue UnpackB = |
2742 | DAG.getNode(Opcode: NVPTXISD::UNPACK_VECTOR, DL, ResultTys: {MVT::i32, MVT::i32}, Ops: B); |
2743 | |
// Arch is little endian: 0 = low bits, 1 = high bits.
2745 | SDValue ALo = UnpackA.getValue(R: 0); |
2746 | SDValue AHi = UnpackA.getValue(R: 1); |
2747 | SDValue BLo = UnpackB.getValue(R: 0); |
2748 | SDValue BHi = UnpackB.getValue(R: 1); |
2749 | |
// The bitfield consists of { AHi : ALo : BHi : BLo }
2751 | // |
2752 | // * FSHL, Amt < 32 - The window will contain { AHi : ALo : BHi } |
2753 | // * FSHL, Amt >= 32 - The window will contain { ALo : BHi : BLo } |
2754 | // * FSHR, Amt < 32 - The window will contain { ALo : BHi : BLo } |
2755 | // * FSHR, Amt >= 32 - The window will contain { AHi : ALo : BHi } |
2756 | // |
2757 | // Note that Amt = 0 and Amt = 32 are special cases where 32-bit funnel shifts |
2758 | // are not needed at all. Amt = 0 is a no-op producing either A or B depending |
2759 | // on the direction. Amt = 32 can be implemented by a packing and unpacking |
2760 | // move to select and arrange the 32bit values. For simplicity, these cases |
2761 | // are not handled here explicitly and instead we rely on DAGCombiner to |
2762 | // remove the no-op funnel shifts we insert. |
2763 | auto [High, Mid, Low] = ((Opcode == ISD::FSHL) == (Amt < 32)) |
2764 | ? std::make_tuple(args&: AHi, args&: ALo, args&: BHi) |
2765 | : std::make_tuple(args&: ALo, args&: BHi, args&: BLo); |
2766 | |
2767 | SDValue NewAmt = DAG.getConstant(Val: Amt & 31, DL, VT: MVT::i32); |
2768 | SDValue RHi = DAG.getNode(Opcode, DL, VT: MVT::i32, Ops: {High, Mid, NewAmt}); |
2769 | SDValue RLo = DAG.getNode(Opcode, DL, VT: MVT::i32, Ops: {Mid, Low, NewAmt}); |
2770 | |
2771 | return DAG.getNode(Opcode: NVPTXISD::BUILD_VECTOR, DL, VT: MVT::i64, Ops: {RLo, RHi}); |
2772 | } |
2773 | |
2774 | static SDValue lowerFSH(SDValue Op, SelectionDAG &DAG) { |
2775 | return expandFSH64(A: Op->getOperand(Num: 0), B: Op->getOperand(Num: 1), ShiftAmount: Op->getOperand(Num: 2), |
2776 | DL: SDLoc(Op), Opcode: Op->getOpcode(), DAG); |
2777 | } |
2778 | |
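// A 64-bit rotate is a funnel shift with both inputs equal.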
2779 | static SDValue lowerROT(SDValue Op, SelectionDAG &DAG) { |
2780 | unsigned Opcode = Op->getOpcode() == ISD::ROTL ? ISD::FSHL : ISD::FSHR; |
2781 | return expandFSH64(A: Op->getOperand(Num: 0), B: Op->getOperand(Num: 0), ShiftAmount: Op->getOperand(Num: 1), |
2782 | DL: SDLoc(Op), Opcode, DAG); |
2783 | } |
2784 | |
2785 | static SDValue lowerFREM(SDValue Op, SelectionDAG &DAG, |
2786 | bool AllowUnsafeFPMath) { |
2787 | // Lower (frem x, y) into (sub x, (mul (ftrunc (div x, y)) y)), |
2788 | // i.e. "poor man's fmod()". When y is infinite, x is returned. This matches |
2789 | // the semantics of LLVM's frem. |
2790 | SDLoc DL(Op); |
2791 | SDValue X = Op->getOperand(Num: 0); |
2792 | SDValue Y = Op->getOperand(Num: 1); |
2793 | EVT Ty = Op.getValueType(); |
2794 | SDNodeFlags Flags = Op->getFlags(); |
2795 | |
2796 | SDValue Div = DAG.getNode(Opcode: ISD::FDIV, DL, VT: Ty, N1: X, N2: Y, Flags); |
2797 | SDValue Trunc = DAG.getNode(Opcode: ISD::FTRUNC, DL, VT: Ty, Operand: Div, Flags); |
2798 | SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL, VT: Ty, N1: Trunc, N2: Y, |
2799 | Flags: Flags | SDNodeFlags::AllowContract); |
2800 | SDValue Sub = DAG.getNode(Opcode: ISD::FSUB, DL, VT: Ty, N1: X, N2: Mul, |
2801 | Flags: Flags | SDNodeFlags::AllowContract); |
2802 | |
2803 | if (AllowUnsafeFPMath || Flags.hasNoInfs()) |
2804 | return Sub; |
2805 | |
2806 | // If Y is infinite, return X |
2807 | SDValue AbsY = DAG.getNode(Opcode: ISD::FABS, DL, VT: Ty, Operand: Y); |
2808 | SDValue Inf = |
2809 | DAG.getConstantFP(Val: APFloat::getInf(Sem: Ty.getFltSemantics()), DL, VT: Ty); |
2810 | SDValue IsInf = DAG.getSetCC(DL, VT: MVT::i1, LHS: AbsY, RHS: Inf, Cond: ISD::SETEQ); |
2811 | return DAG.getSelect(DL, VT: Ty, Cond: IsInf, LHS: X, RHS: Sub); |
2812 | } |
2813 | |
2814 | static SDValue lowerSELECT(SDValue Op, SelectionDAG &DAG) { |
2815 | assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1" ); |
2816 | |
2817 | SDValue Cond = Op->getOperand(Num: 0); |
2818 | SDValue TrueVal = Op->getOperand(Num: 1); |
2819 | SDValue FalseVal = Op->getOperand(Num: 2); |
2820 | SDLoc DL(Op); |
2821 | |
2822 | // If both operands are truncated, we push the select through the truncates. |
2823 | if (TrueVal.getOpcode() == ISD::TRUNCATE && |
2824 | FalseVal.getOpcode() == ISD::TRUNCATE) { |
2825 | TrueVal = TrueVal.getOperand(i: 0); |
2826 | FalseVal = FalseVal.getOperand(i: 0); |
2827 | |
2828 | EVT VT = TrueVal.getSimpleValueType().bitsLE(VT: FalseVal.getSimpleValueType()) |
2829 | ? TrueVal.getValueType() |
2830 | : FalseVal.getValueType(); |
2831 | TrueVal = DAG.getAnyExtOrTrunc(Op: TrueVal, DL, VT); |
2832 | FalseVal = DAG.getAnyExtOrTrunc(Op: FalseVal, DL, VT); |
2833 | SDValue Select = DAG.getSelect(DL, VT, Cond, LHS: TrueVal, RHS: FalseVal); |
2834 | return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i1, Operand: Select); |
2835 | } |
2836 | |
2837 | // Otherwise, expand the select into a series of logical operations. These |
2838 | // often can be folded into other operations either by us or ptxas. |
2839 | TrueVal = DAG.getFreeze(V: TrueVal); |
2840 | FalseVal = DAG.getFreeze(V: FalseVal); |
2841 | SDValue And1 = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i1, N1: Cond, N2: TrueVal); |
2842 | SDValue NotCond = DAG.getNOT(DL, Val: Cond, VT: MVT::i1); |
2843 | SDValue And2 = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i1, N1: NotCond, N2: FalseVal); |
2844 | SDValue Or = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i1, N1: And1, N2: And2); |
2845 | return Or; |
2846 | } |
2847 | |
2848 | SDValue |
2849 | NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { |
2850 | switch (Op.getOpcode()) { |
2851 | case ISD::RETURNADDR: |
2852 | return SDValue(); |
2853 | case ISD::FRAMEADDR: |
2854 | return SDValue(); |
2855 | case ISD::ADDRSPACECAST: |
2856 | return LowerADDRSPACECAST(Op, DAG); |
2857 | case ISD::INTRINSIC_W_CHAIN: |
2858 | return Op; |
2859 | case ISD::INTRINSIC_WO_CHAIN: |
2860 | return lowerIntrinsicWOChain(Op, DAG); |
2861 | case ISD::INTRINSIC_VOID: |
2862 | return LowerIntrinsicVoid(Op, DAG); |
2863 | case ISD::BUILD_VECTOR: |
2864 | return LowerBUILD_VECTOR(Op, DAG); |
2865 | case ISD::BITCAST: |
2866 | return LowerBITCAST(Op, DAG); |
2867 | case ISD::EXTRACT_SUBVECTOR: |
2868 | return Op; |
2869 | case ISD::EXTRACT_VECTOR_ELT: |
2870 | return LowerEXTRACT_VECTOR_ELT(Op, DAG); |
2871 | case ISD::INSERT_VECTOR_ELT: |
2872 | return LowerINSERT_VECTOR_ELT(Op, DAG); |
2873 | case ISD::VECTOR_SHUFFLE: |
2874 | return LowerVECTOR_SHUFFLE(Op, DAG); |
2875 | case ISD::CONCAT_VECTORS: |
2876 | return LowerCONCAT_VECTORS(Op, DAG); |
2877 | case ISD::STORE: |
2878 | return LowerSTORE(Op, DAG); |
2879 | case ISD::LOAD: |
2880 | return LowerLOAD(Op, DAG); |
2881 | case ISD::SHL_PARTS: |
2882 | return LowerShiftLeftParts(Op, DAG); |
2883 | case ISD::SRA_PARTS: |
2884 | case ISD::SRL_PARTS: |
2885 | return LowerShiftRightParts(Op, DAG); |
2886 | case ISD::SELECT: |
2887 | return lowerSELECT(Op, DAG); |
2888 | case ISD::FROUND: |
2889 | return LowerFROUND(Op, DAG); |
2890 | case ISD::FCOPYSIGN: |
2891 | return LowerFCOPYSIGN(Op, DAG); |
2892 | case ISD::SINT_TO_FP: |
2893 | case ISD::UINT_TO_FP: |
2894 | return LowerINT_TO_FP(Op, DAG); |
2895 | case ISD::FP_TO_SINT: |
2896 | case ISD::FP_TO_UINT: |
2897 | return LowerFP_TO_INT(Op, DAG); |
2898 | case ISD::FP_ROUND: |
2899 | return LowerFP_ROUND(Op, DAG); |
2900 | case ISD::FP_EXTEND: |
2901 | return LowerFP_EXTEND(Op, DAG); |
2902 | case ISD::BR_JT: |
2903 | return LowerBR_JT(Op, DAG); |
2904 | case ISD::VAARG: |
2905 | return LowerVAARG(Op, DAG); |
2906 | case ISD::VASTART: |
2907 | return LowerVASTART(Op, DAG); |
2908 | case ISD::FSHL: |
2909 | case ISD::FSHR: |
2910 | return lowerFSH(Op, DAG); |
2911 | case ISD::ROTL: |
2912 | case ISD::ROTR: |
2913 | return lowerROT(Op, DAG); |
2914 | case ISD::ABS: |
2915 | case ISD::SMIN: |
2916 | case ISD::SMAX: |
2917 | case ISD::UMIN: |
2918 | case ISD::UMAX: |
2919 | case ISD::ADD: |
2920 | case ISD::SUB: |
2921 | case ISD::MUL: |
2922 | case ISD::SHL: |
2923 | case ISD::SREM: |
2924 | case ISD::UREM: |
2925 | return LowerVectorArith(Op, DAG); |
2926 | case ISD::DYNAMIC_STACKALLOC: |
2927 | return LowerDYNAMIC_STACKALLOC(Op, DAG); |
2928 | case ISD::STACKRESTORE: |
2929 | return LowerSTACKRESTORE(Op, DAG); |
2930 | case ISD::STACKSAVE: |
2931 | return LowerSTACKSAVE(Op, DAG); |
2932 | case ISD::CopyToReg: |
2933 | return LowerCopyToReg_128(Op, DAG); |
2934 | case ISD::FADD: |
2935 | case ISD::FSUB: |
2936 | case ISD::FMUL: |
// Used only for bf16 on SM80, where we select fma for the non-ftz operation.
2938 | return PromoteBinOpIfF32FTZ(Op, DAG); |
2939 | case ISD::CTPOP: |
2940 | case ISD::CTLZ: |
2941 | return lowerCTLZCTPOP(Op, DAG); |
2942 | case ISD::FREM: |
2943 | return lowerFREM(Op, DAG, AllowUnsafeFPMath: allowUnsafeFPMath(MF: DAG.getMachineFunction())); |
2944 | |
2945 | default: |
2946 | llvm_unreachable("Custom lowering not defined for operation" ); |
2947 | } |
2948 | } |
2949 | |
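// Lower BR_JT into a BrxStart/BrxItem/BrxEnd sequence so that the jump table
// is emitted inline (see getJumpTableEncoding below).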
2950 | SDValue NVPTXTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const { |
2951 | SDLoc DL(Op); |
2952 | SDValue Chain = Op.getOperand(i: 0); |
2953 | const auto *JT = cast<JumpTableSDNode>(Val: Op.getOperand(i: 1)); |
2954 | SDValue Index = Op.getOperand(i: 2); |
2955 | |
2956 | unsigned JId = JT->getIndex(); |
2957 | MachineJumpTableInfo *MJTI = DAG.getMachineFunction().getJumpTableInfo(); |
2958 | ArrayRef<MachineBasicBlock *> MBBs = MJTI->getJumpTables()[JId].MBBs; |
2959 | |
2960 | SDValue IdV = DAG.getConstant(Val: JId, DL, VT: MVT::i32); |
2961 | |
2962 | // Generate BrxStart node |
2963 | SDVTList VTs = DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue); |
2964 | Chain = DAG.getNode(Opcode: NVPTXISD::BrxStart, DL, VTList: VTs, N1: Chain, N2: IdV); |
2965 | |
2966 | // Generate BrxItem nodes |
2967 | assert(!MBBs.empty()); |
2968 | for (MachineBasicBlock *MBB : MBBs.drop_back()) |
2969 | Chain = DAG.getNode(Opcode: NVPTXISD::BrxItem, DL, VTList: VTs, N1: Chain.getValue(R: 0), |
2970 | N2: DAG.getBasicBlock(MBB), N3: Chain.getValue(R: 1)); |
2971 | |
// Generate the BrxEnd node
2973 | SDValue EndOps[] = {Chain.getValue(R: 0), DAG.getBasicBlock(MBB: MBBs.back()), Index, |
2974 | IdV, Chain.getValue(R: 1)}; |
2975 | SDValue BrxEnd = DAG.getNode(Opcode: NVPTXISD::BrxEnd, DL, VTList: VTs, Ops: EndOps); |
2976 | |
2977 | return BrxEnd; |
2978 | } |
2979 | |
2980 | // This will prevent AsmPrinter from trying to print the jump tables itself. |
2981 | unsigned NVPTXTargetLowering::getJumpTableEncoding() const { |
2982 | return MachineJumpTableInfo::EK_Inline; |
2983 | } |
2984 | |
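// Address space casts where neither side is the generic space are not directly
// representable: shared <-> shared::cluster casts are routed through the
// generic space, and any other such cast is undefined.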
2985 | SDValue NVPTXTargetLowering::LowerADDRSPACECAST(SDValue Op, |
2986 | SelectionDAG &DAG) const { |
2987 | AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Val: Op.getNode()); |
2988 | unsigned SrcAS = N->getSrcAddressSpace(); |
2989 | unsigned DestAS = N->getDestAddressSpace(); |
2990 | if (SrcAS != llvm::ADDRESS_SPACE_GENERIC && |
2991 | DestAS != llvm::ADDRESS_SPACE_GENERIC) { |
// Shared and SharedCluster can be converted to each other through the
// generic address space.
2994 | if ((SrcAS == llvm::ADDRESS_SPACE_SHARED && |
2995 | DestAS == llvm::ADDRESS_SPACE_SHARED_CLUSTER) || |
2996 | (SrcAS == llvm::ADDRESS_SPACE_SHARED_CLUSTER && |
2997 | DestAS == llvm::ADDRESS_SPACE_SHARED)) { |
2998 | SDLoc DL(Op.getNode()); |
const MVT GenericVT =
    getPointerTy(DL: DAG.getDataLayout(), AS: ADDRESS_SPACE_GENERIC);
SDValue GenericConversion = DAG.getAddrSpaceCast(
    dl: DL, VT: GenericVT, Ptr: Op.getOperand(i: 0), SrcAS, DestAS: ADDRESS_SPACE_GENERIC);
3003 | SDValue SharedClusterConversion = |
3004 | DAG.getAddrSpaceCast(dl: DL, VT: Op.getValueType(), Ptr: GenericConversion, |
3005 | SrcAS: ADDRESS_SPACE_GENERIC, DestAS); |
3006 | return SharedClusterConversion; |
3007 | } |
3008 | |
3009 | return DAG.getUNDEF(VT: Op.getValueType()); |
3010 | } |
3011 | |
3012 | return Op; |
3013 | } |
3014 | |
// This function is almost a copy of SelectionDAG::expandVAArg().
// The only difference is that this one produces loads from the local address
// space.
3017 | SDValue NVPTXTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { |
3018 | const TargetLowering *TLI = STI.getTargetLowering(); |
3019 | SDLoc DL(Op); |
3020 | |
3021 | SDNode *Node = Op.getNode(); |
3022 | const Value *V = cast<SrcValueSDNode>(Val: Node->getOperand(Num: 2))->getValue(); |
3023 | EVT VT = Node->getValueType(ResNo: 0); |
3024 | auto *Ty = VT.getTypeForEVT(Context&: *DAG.getContext()); |
3025 | SDValue Tmp1 = Node->getOperand(Num: 0); |
3026 | SDValue Tmp2 = Node->getOperand(Num: 1); |
3027 | const MaybeAlign MA(Node->getConstantOperandVal(Num: 3)); |
3028 | |
3029 | SDValue VAListLoad = DAG.getLoad(VT: TLI->getPointerTy(DL: DAG.getDataLayout()), dl: DL, |
3030 | Chain: Tmp1, Ptr: Tmp2, PtrInfo: MachinePointerInfo(V)); |
3031 | SDValue VAList = VAListLoad; |
3032 | |
3033 | if (MA && *MA > TLI->getMinStackArgumentAlignment()) { |
3034 | VAList = DAG.getNode( |
3035 | Opcode: ISD::ADD, DL, VT: VAList.getValueType(), N1: VAList, |
3036 | N2: DAG.getConstant(Val: MA->value() - 1, DL, VT: VAList.getValueType())); |
3037 | |
3038 | VAList = DAG.getNode(Opcode: ISD::AND, DL, VT: VAList.getValueType(), N1: VAList, |
3039 | N2: DAG.getSignedConstant(Val: -(int64_t)MA->value(), DL, |
3040 | VT: VAList.getValueType())); |
3041 | } |
3042 | |
3043 | // Increment the pointer, VAList, to the next vaarg |
3044 | Tmp1 = DAG.getNode(Opcode: ISD::ADD, DL, VT: VAList.getValueType(), N1: VAList, |
3045 | N2: DAG.getConstant(Val: DAG.getDataLayout().getTypeAllocSize(Ty), |
3046 | DL, VT: VAList.getValueType())); |
3047 | |
3048 | // Store the incremented VAList to the legalized pointer |
3049 | Tmp1 = DAG.getStore(Chain: VAListLoad.getValue(R: 1), dl: DL, Val: Tmp1, Ptr: Tmp2, |
3050 | PtrInfo: MachinePointerInfo(V)); |
3051 | |
3052 | const Value *SrcV = Constant::getNullValue( |
3053 | Ty: PointerType::get(C&: *DAG.getContext(), AddressSpace: ADDRESS_SPACE_LOCAL)); |
3054 | |
3055 | // Load the actual argument out of the pointer VAList |
3056 | return DAG.getLoad(VT, dl: DL, Chain: Tmp1, Ptr: VAList, PtrInfo: MachinePointerInfo(SrcV)); |
3057 | } |
3058 | |
3059 | SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { |
3060 | const TargetLowering *TLI = STI.getTargetLowering(); |
3061 | SDLoc DL(Op); |
3062 | EVT PtrVT = TLI->getPointerTy(DL: DAG.getDataLayout()); |
3063 | |
3064 | // Store the address of unsized array <function>_vararg[] in the ap object. |
3065 | SDValue VAReg = getParamSymbol(DAG, /* vararg */ I: -1, T: PtrVT); |
3066 | |
3067 | const Value *SV = cast<SrcValueSDNode>(Val: Op.getOperand(i: 2))->getValue(); |
3068 | return DAG.getStore(Chain: Op.getOperand(i: 0), dl: DL, Val: VAReg, Ptr: Op.getOperand(i: 1), |
3069 | PtrInfo: MachinePointerInfo(SV)); |
3070 | } |
3071 | |
3072 | SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { |
3073 | if (Op.getValueType() == MVT::i1) |
3074 | return LowerLOADi1(Op, DAG); |
3075 | |
// v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on the legalizer to
// handle unaligned loads and have to handle them here.
3078 | EVT VT = Op.getValueType(); |
3079 | if (Isv2x16VT(VT) || VT == MVT::v4i8) { |
3080 | LoadSDNode *Load = cast<LoadSDNode>(Val&: Op); |
3081 | EVT MemVT = Load->getMemoryVT(); |
3082 | if (!allowsMemoryAccessForAlignment(Context&: *DAG.getContext(), DL: DAG.getDataLayout(), |
3083 | VT: MemVT, MMO: *Load->getMemOperand())) { |
3084 | SDValue Ops[2]; |
3085 | std::tie(args&: Ops[0], args&: Ops[1]) = expandUnalignedLoad(LD: Load, DAG); |
3086 | return DAG.getMergeValues(Ops, dl: SDLoc(Op)); |
3087 | } |
3088 | } |
3089 | |
3090 | return SDValue(); |
3091 | } |
3092 | |
3093 | // v = ld i1* addr |
3094 | // => |
3095 | // v1 = ld i8* addr (-> i16) |
3096 | // v = trunc i16 to i1 |
3097 | SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const { |
3098 | SDNode *Node = Op.getNode(); |
3099 | LoadSDNode *LD = cast<LoadSDNode>(Val: Node); |
3100 | SDLoc dl(Node); |
3101 | assert(LD->getExtensionType() == ISD::NON_EXTLOAD); |
3102 | assert(Node->getValueType(0) == MVT::i1 && |
3103 | "Custom lowering for i1 load only" ); |
3104 | SDValue newLD = DAG.getExtLoad(ExtType: ISD::ZEXTLOAD, dl, VT: MVT::i16, Chain: LD->getChain(), |
3105 | Ptr: LD->getBasePtr(), PtrInfo: LD->getPointerInfo(), |
3106 | MemVT: MVT::i8, Alignment: LD->getAlign(), |
3107 | MMOFlags: LD->getMemOperand()->getFlags()); |
3108 | SDValue result = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i1, Operand: newLD); |
3109 | // The legalizer (the caller) is expecting two values from the legalized |
3110 | // load, so we build a MergeValues node for it. See ExpandUnalignedLoad() |
3111 | // in LegalizeDAG.cpp which also uses MergeValues. |
3112 | SDValue Ops[] = { result, LD->getChain() }; |
3113 | return DAG.getMergeValues(Ops, dl); |
3114 | } |
3115 | |
3116 | SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { |
3117 | StoreSDNode *Store = cast<StoreSDNode>(Val&: Op); |
3118 | EVT VT = Store->getMemoryVT(); |
3119 | |
3120 | if (VT == MVT::i1) |
3121 | return LowerSTOREi1(Op, DAG); |
3122 | |
// v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on the legalizer to
// handle unaligned stores and have to handle them here.
3125 | if ((Isv2x16VT(VT) || VT == MVT::v4i8) && |
3126 | !allowsMemoryAccessForAlignment(Context&: *DAG.getContext(), DL: DAG.getDataLayout(), |
3127 | VT, MMO: *Store->getMemOperand())) |
3128 | return expandUnalignedStore(ST: Store, DAG); |
3129 | |
// Otherwise, v2f16, v2bf16, v2i16 and v4i8 don't need special handling.
3131 | if (Isv2x16VT(VT) || VT == MVT::v4i8) |
3132 | return SDValue(); |
3133 | |
3134 | return LowerSTOREVector(Op, DAG); |
3135 | } |
3136 | |
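// Lower a vector store to a StoreV2/StoreV4/StoreV8 target node when the
// element count and alignment allow it; otherwise return SDValue() so the
// store is split or scalarized by the legalizer.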
3137 | SDValue |
3138 | NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { |
3139 | MemSDNode *N = cast<MemSDNode>(Val: Op.getNode()); |
3140 | SDValue Val = N->getOperand(Num: 1); |
3141 | SDLoc DL(N); |
3142 | const EVT ValVT = Val.getValueType(); |
3143 | const EVT MemVT = N->getMemoryVT(); |
3144 | |
3145 | // If we're truncating as part of the store, avoid lowering to a StoreV node. |
3146 | // TODO: consider relaxing this restriction. |
3147 | if (ValVT != MemVT) |
3148 | return SDValue(); |
3149 | |
3150 | const auto NumEltsAndEltVT = getVectorLoweringShape( |
3151 | VectorEVT: ValVT, CanLowerTo256Bit: STI.has256BitVectorLoadStore(AS: N->getAddressSpace())); |
3152 | if (!NumEltsAndEltVT) |
3153 | return SDValue(); |
3154 | const auto [NumElts, EltVT] = NumEltsAndEltVT.value(); |
3155 | |
3156 | const DataLayout &TD = DAG.getDataLayout(); |
3157 | |
3158 | Align Alignment = N->getAlign(); |
3159 | Align PrefAlign = TD.getPrefTypeAlign(Ty: ValVT.getTypeForEVT(Context&: *DAG.getContext())); |
3160 | if (Alignment < PrefAlign) { |
3161 | // This store is not sufficiently aligned, so bail out and let this vector |
3162 | // store be scalarized. Note that we may still be able to emit smaller |
3163 | // vector stores. For example, if we are storing a <4 x float> with an |
3164 | // alignment of 8, this check will fail but the legalizer will try again |
3165 | // with 2 x <2 x float>, which will succeed with an alignment of 8. |
3166 | return SDValue(); |
3167 | } |
3168 | |
3169 | unsigned Opcode; |
3170 | switch (NumElts) { |
3171 | default: |
3172 | return SDValue(); |
3173 | case 2: |
3174 | Opcode = NVPTXISD::StoreV2; |
3175 | break; |
3176 | case 4: |
3177 | Opcode = NVPTXISD::StoreV4; |
3178 | break; |
3179 | case 8: |
3180 | Opcode = NVPTXISD::StoreV8; |
3181 | break; |
3182 | } |
3183 | |
3184 | SmallVector<SDValue, 8> Ops; |
3185 | |
3186 | // First is the chain |
3187 | Ops.push_back(Elt: N->getOperand(Num: 0)); |
3188 | |
3189 | // Then the split values |
3190 | if (EltVT.isVector()) { |
3191 | assert(EVT(EltVT.getVectorElementType()) == ValVT.getVectorElementType()); |
3192 | assert(NumElts * EltVT.getVectorNumElements() == |
3193 | ValVT.getVectorNumElements()); |
3194 | // Combine individual elements into v2[i,f,bf]16/v4i8 subvectors to be |
3195 | // stored as b32s |
3196 | const unsigned NumEltsPerSubVector = EltVT.getVectorNumElements(); |
3197 | for (const unsigned I : llvm::seq(Size: NumElts)) { |
3198 | SmallVector<SDValue, 4> SubVectorElts; |
3199 | DAG.ExtractVectorElements(Op: Val, Args&: SubVectorElts, Start: I * NumEltsPerSubVector, |
3200 | Count: NumEltsPerSubVector); |
3201 | Ops.push_back(Elt: DAG.getBuildVector(VT: EltVT, DL, Ops: SubVectorElts)); |
3202 | } |
3203 | } else { |
3204 | SDValue V = DAG.getBitcast(VT: MVT::getVectorVT(VT: EltVT, NumElements: NumElts), V: Val); |
3205 | for (const unsigned I : llvm::seq(Size: NumElts)) { |
3206 | SDValue ExtVal = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: EltVT, N1: V, |
3207 | N2: DAG.getIntPtrConstant(Val: I, DL)); |
3208 | |
3209 | // Since StoreV2 is a target node, we cannot rely on DAG type |
3210 | // legalization. Therefore, we must ensure the type is legal. For i1 and |
3211 | // i8, we set the stored type to i16 and propagate the "real" type as the |
3212 | // memory type. |
3213 | if (EltVT.getSizeInBits() < 16) |
3214 | ExtVal = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::i16, Operand: ExtVal); |
3215 | Ops.push_back(Elt: ExtVal); |
3216 | } |
3217 | } |
3218 | |
3219 | // Then any remaining arguments |
3220 | Ops.append(in_start: N->op_begin() + 2, in_end: N->op_end()); |
3221 | |
3222 | SDValue NewSt = |
3223 | DAG.getMemIntrinsicNode(Opcode, dl: DL, VTList: DAG.getVTList(VT: MVT::Other), Ops, |
3224 | MemVT: N->getMemoryVT(), MMO: N->getMemOperand()); |
3225 | |
3227 | return NewSt; |
3228 | } |
3229 | |
3230 | // st i1 v, addr |
3231 | // => |
3232 | // v1 = zxt v to i16 |
3233 | // st.u8 i16, addr |
3234 | SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const { |
3235 | SDNode *Node = Op.getNode(); |
3236 | SDLoc dl(Node); |
3237 | StoreSDNode *ST = cast<StoreSDNode>(Val: Node); |
3238 | SDValue Tmp1 = ST->getChain(); |
3239 | SDValue Tmp2 = ST->getBasePtr(); |
3240 | SDValue Tmp3 = ST->getValue(); |
3241 | assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only" ); |
3242 | Tmp3 = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: dl, VT: MVT::i16, Operand: Tmp3); |
3243 | SDValue Result = |
3244 | DAG.getTruncStore(Chain: Tmp1, dl, Val: Tmp3, Ptr: Tmp2, PtrInfo: ST->getPointerInfo(), SVT: MVT::i8, |
3245 | Alignment: ST->getAlign(), MMOFlags: ST->getMemOperand()->getFlags()); |
3246 | return Result; |
3247 | } |
3248 | |
3249 | SDValue NVPTXTargetLowering::LowerCopyToReg_128(SDValue Op, |
3250 | SelectionDAG &DAG) const { |
3251 | // Change the CopyToReg to take in two 64-bit operands instead of a 128-bit |
3252 | // operand so that it can pass the legalization. |
3253 | |
3254 | assert(Op.getOperand(1).getValueType() == MVT::i128 && |
3255 | "Custom lowering for 128-bit CopyToReg only" ); |
3256 | |
3257 | SDNode *Node = Op.getNode(); |
3258 | SDLoc DL(Node); |
3259 | |
3260 | SDValue Cast = DAG.getBitcast(VT: MVT::v2i64, V: Op->getOperand(Num: 2)); |
3261 | SDValue Lo = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i64, N1: Cast, |
3262 | N2: DAG.getIntPtrConstant(Val: 0, DL)); |
3263 | SDValue Hi = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i64, N1: Cast, |
3264 | N2: DAG.getIntPtrConstant(Val: 1, DL)); |
3265 | |
3266 | SmallVector<SDValue, 5> NewOps(Op->getNumOperands() + 1); |
3267 | SmallVector<EVT, 3> ResultsType(Node->values()); |
3268 | |
3269 | NewOps[0] = Op->getOperand(Num: 0); // Chain |
3270 | NewOps[1] = Op->getOperand(Num: 1); // Dst Reg |
3271 | NewOps[2] = Lo; // Lower 64-bit |
3272 | NewOps[3] = Hi; // Higher 64-bit |
3273 | if (Op.getNumOperands() == 4) |
3274 | NewOps[4] = Op->getOperand(Num: 3); // Glue if exists |
3275 | |
3276 | return DAG.getNode(Opcode: ISD::CopyToReg, DL, ResultTys: ResultsType, Ops: NewOps); |
3277 | } |
3278 | |
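// An i128 value kept in a single i128 register counts as one register;
// everything else falls back to the default computation.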
3279 | unsigned NVPTXTargetLowering::getNumRegisters( |
3280 | LLVMContext &Context, EVT VT, |
3281 | std::optional<MVT> RegisterVT = std::nullopt) const { |
3282 | if (VT == MVT::i128 && RegisterVT == MVT::i128) |
3283 | return 1; |
3284 | return TargetLoweringBase::getNumRegisters(Context, VT, RegisterVT); |
3285 | } |
3286 | |
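// Pass a single-part i128 value through unchanged instead of letting the
// generic code split it; everything else uses the default handling.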
3287 | bool NVPTXTargetLowering::splitValueIntoRegisterParts( |
3288 | SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts, |
3289 | unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const { |
3290 | if (Val.getValueType() == MVT::i128 && NumParts == 1) { |
3291 | Parts[0] = Val; |
3292 | return true; |
3293 | } |
3294 | return false; |
3295 | } |
3296 | |
// This creates a target external symbol for a function parameter.
// The name of the symbol is composed from the parameter's index and the
// function name. A negative index corresponds to the special parameter
// (an unsized array) used for passing variable arguments.
3301 | SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int I, |
3302 | EVT T) const { |
3303 | StringRef SavedStr = nvTM->getStrPool().save( |
3304 | S: getParamName(F: &DAG.getMachineFunction().getFunction(), Idx: I)); |
3305 | return DAG.getExternalSymbol(Sym: SavedStr.data(), VT: T); |
3306 | } |
3307 | |
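// Creates the external symbol used to address a call-site parameter,
// e.g. "param0" for the first outgoing argument.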
3308 | SDValue NVPTXTargetLowering::getCallParamSymbol(SelectionDAG &DAG, int I, |
3309 | EVT T) const { |
3310 | const StringRef SavedStr = nvTM->getStrPool().save(S: "param" + Twine(I)); |
3311 | return DAG.getExternalSymbol(Sym: SavedStr.data(), VT: T); |
3312 | } |
3313 | |
3314 | SDValue NVPTXTargetLowering::LowerFormalArguments( |
3315 | SDValue Chain, CallingConv::ID CallConv, bool isVarArg, |
3316 | const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, |
3317 | SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { |
3318 | MachineFunction &MF = DAG.getMachineFunction(); |
3319 | const DataLayout &DL = DAG.getDataLayout(); |
3320 | auto PtrVT = getPointerTy(DL: DAG.getDataLayout()); |
3321 | |
3322 | const Function *F = &MF.getFunction(); |
3323 | |
3324 | SDValue Root = DAG.getRoot(); |
3325 | SmallVector<SDValue, 16> OutChains; |
3326 | |
// The number of IR arguments (F->args()) and Ins.size() need not match.
// Ins.size() will be larger
// * if there is an aggregate argument with multiple fields (each field
//   showing up separately in Ins)
// * if there is a vector argument with more than typical vector-length
//   elements (generally if more than 4) where each vector element is
//   individually present in Ins.
// So a different index should be used for indexing into Ins.
// See similar issue in LowerCall.
3336 | |
3337 | auto AllIns = ArrayRef(Ins); |
3338 | for (const auto &Arg : F->args()) { |
3339 | const auto ArgIns = AllIns.take_while( |
3340 | Pred: [&](auto I) { return I.OrigArgIndex == Arg.getArgNo(); }); |
3341 | AllIns = AllIns.drop_front(N: ArgIns.size()); |
3342 | |
3343 | Type *Ty = Arg.getType(); |
3344 | |
3345 | if (ArgIns.empty()) |
3346 | report_fatal_error(reason: "Empty parameter types are not supported" ); |
3347 | |
3348 | if (Arg.use_empty()) { |
3349 | // argument is dead |
3350 | for (const auto &In : ArgIns) { |
3351 | assert(!In.Used && "Arg.use_empty() is true but Arg is used?" ); |
3352 | InVals.push_back(Elt: DAG.getUNDEF(VT: In.VT)); |
3353 | } |
3354 | continue; |
3355 | } |
3356 | |
3357 | SDValue ArgSymbol = getParamSymbol(DAG, I: Arg.getArgNo(), T: PtrVT); |
3358 | |
// In the following cases, assign a node order of "ArgNo + 1" to newly created
// nodes. The SDNodes for params have to appear in the same order as their
// order of appearance in the original function; "Arg.getArgNo() + 1" holds
// that order.
3363 | if (Arg.hasByValAttr()) { |
3364 | // Param has ByVal attribute |
3365 | // Return MoveParam(param symbol). |
3366 | // Ideally, the param symbol can be returned directly, |
3367 | // but when SDNode builder decides to use it in a CopyToReg(), |
3368 | // machine instruction fails because TargetExternalSymbol |
3369 | // (not lowered) is target dependent, and CopyToReg assumes |
3370 | // the source is lowered. |
3371 | assert(ArgIns.size() == 1 && "ByVal argument must be a pointer" ); |
3372 | const auto &ByvalIn = ArgIns[0]; |
3373 | assert(getValueType(DL, Ty) == ByvalIn.VT && |
3374 | "Ins type did not match function type" ); |
3375 | assert(ByvalIn.VT == PtrVT && "ByVal argument must be a pointer" ); |
3376 | |
3377 | SDValue P; |
3378 | if (isKernelFunction(F: *F)) { |
3379 | P = ArgSymbol; |
3380 | P.getNode()->setIROrder(Arg.getArgNo() + 1); |
3381 | } else { |
3382 | P = DAG.getNode(Opcode: NVPTXISD::MoveParam, DL: dl, VT: ByvalIn.VT, Operand: ArgSymbol); |
3383 | P.getNode()->setIROrder(Arg.getArgNo() + 1); |
3384 | P = DAG.getAddrSpaceCast(dl, VT: ByvalIn.VT, Ptr: P, SrcAS: ADDRESS_SPACE_LOCAL, |
3385 | DestAS: ADDRESS_SPACE_GENERIC); |
3386 | } |
3387 | InVals.push_back(Elt: P); |
3388 | } else { |
3389 | SmallVector<EVT, 16> VTs; |
3390 | SmallVector<uint64_t, 16> Offsets; |
3391 | ComputePTXValueVTs(TLI: *this, DL, Ty, ValueVTs&: VTs, Offsets: &Offsets, StartingOffset: 0); |
3392 | assert(VTs.size() == ArgIns.size() && "Size mismatch" ); |
3393 | assert(VTs.size() == Offsets.size() && "Size mismatch" ); |
3394 | |
3395 | const Align ArgAlign = getFunctionArgumentAlignment( |
3396 | F, Ty, Idx: Arg.getArgNo() + AttributeList::FirstArgIndex, DL); |
3397 | |
3398 | const auto VectorInfo = VectorizePTXValueVTs(ValueVTs: VTs, Offsets, ParamAlignment: ArgAlign); |
3399 | unsigned I = 0; |
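// Load the argument from the .param address space in vectorized chunks, then
// split each loaded chunk back into the individual values expected by Ins.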
3400 | for (const unsigned NumElts : VectorInfo) { |
3401 | // i1 is loaded/stored as i8 |
3402 | const EVT LoadVT = VTs[I] == MVT::i1 ? MVT::i8 : VTs[I]; |
3403 | // If the element is a packed type (ex. v2f16, v4i8, etc) holding |
3404 | // multiple elements. |
3405 | const unsigned PackingAmt = |
3406 | LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1; |
3407 | |
3408 | const EVT VecVT = |
3409 | NumElts == 1 |
3410 | ? LoadVT |
3411 | : EVT::getVectorVT(Context&: F->getContext(), VT: LoadVT.getScalarType(), |
3412 | NumElements: NumElts * PackingAmt); |
3413 | |
3414 | SDValue VecAddr = DAG.getObjectPtrOffset( |
3415 | SL: dl, Ptr: ArgSymbol, Offset: TypeSize::getFixed(ExactSize: Offsets[I])); |
3416 | |
3417 | const MaybeAlign PartAlign = commonAlignment(A: ArgAlign, Offset: Offsets[I]); |
3418 | SDValue P = |
3419 | DAG.getLoad(VT: VecVT, dl, Chain: Root, Ptr: VecAddr, |
3420 | PtrInfo: MachinePointerInfo(ADDRESS_SPACE_PARAM), Alignment: PartAlign, |
3421 | MMOFlags: MachineMemOperand::MODereferenceable | |
3422 | MachineMemOperand::MOInvariant); |
3423 | if (P.getNode()) |
3424 | P.getNode()->setIROrder(Arg.getArgNo() + 1); |
3425 | for (const unsigned J : llvm::seq(Size: NumElts)) { |
3426 | SDValue Elt = |
3427 | NumElts == 1 |
3428 | ? P |
3429 | : DAG.getNode(Opcode: LoadVT.isVector() ? ISD::EXTRACT_SUBVECTOR |
3430 | : ISD::EXTRACT_VECTOR_ELT, |
3431 | DL: dl, VT: LoadVT, N1: P, |
3432 | N2: DAG.getVectorIdxConstant(Val: J * PackingAmt, DL: dl)); |
3433 | |
3434 | Elt = correctParamType(V: Elt, ExpectedVT: ArgIns[I + J].VT, Flags: ArgIns[I + J].Flags, |
3435 | DAG, dl); |
3436 | InVals.push_back(Elt); |
3437 | } |
3438 | I += NumElts; |
3439 | } |
3440 | } |
3441 | } |
3442 | |
3443 | if (!OutChains.empty()) |
3444 | DAG.setRoot(DAG.getTokenFactor(DL: dl, Vals&: OutChains)); |
3445 | |
3446 | return Chain; |
3447 | } |
3448 | |
3449 | SDValue |
3450 | NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, |
3451 | bool isVarArg, |
3452 | const SmallVectorImpl<ISD::OutputArg> &Outs, |
3453 | const SmallVectorImpl<SDValue> &OutVals, |
3454 | const SDLoc &dl, SelectionDAG &DAG) const { |
3455 | const MachineFunction &MF = DAG.getMachineFunction(); |
3456 | const Function &F = MF.getFunction(); |
3457 | Type *RetTy = MF.getFunction().getReturnType(); |
3458 | |
3459 | if (RetTy->isVoidTy()) { |
assert(OutVals.empty() && Outs.empty() &&
       "No return values expected for a void return type");
3461 | return DAG.getNode(Opcode: NVPTXISD::RET_GLUE, DL: dl, VT: MVT::Other, Operand: Chain); |
3462 | } |
3463 | |
3464 | const DataLayout &DL = DAG.getDataLayout(); |
3465 | SmallVector<EVT, 16> VTs; |
3466 | SmallVector<uint64_t, 16> Offsets; |
3467 | ComputePTXValueVTs(TLI: *this, DL, Ty: RetTy, ValueVTs&: VTs, Offsets: &Offsets); |
3468 | assert(VTs.size() == OutVals.size() && "Bad return value decomposition" ); |
3469 | |
3470 | // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than |
3471 | // 32-bits are sign extended or zero extended, depending on whether |
3472 | // they are signed or unsigned types. |
3473 | const bool ExtendIntegerRetVal = |
3474 | RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty: RetTy) < 32; |
3475 | |
3476 | const auto GetRetVal = [&](unsigned I) -> SDValue { |
3477 | SDValue RetVal = OutVals[I]; |
3478 | assert(promoteScalarIntegerPTX(RetVal.getValueType()) == |
3479 | RetVal.getValueType() && |
3480 | "OutVal type should always be legal" ); |
3481 | |
3482 | const EVT VTI = promoteScalarIntegerPTX(VT: VTs[I]); |
3483 | const EVT StoreVT = |
3484 | ExtendIntegerRetVal ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI); |
3485 | return correctParamType(V: RetVal, ExpectedVT: StoreVT, Flags: Outs[I].Flags, DAG, dl); |
3486 | }; |
3487 | |
3488 | const auto RetAlign = getFunctionParamOptimizedAlign(F: &F, ArgTy: RetTy, DL); |
3489 | const auto VectorInfo = VectorizePTXValueVTs(ValueVTs: VTs, Offsets, ParamAlignment: RetAlign); |
3490 | unsigned I = 0; |
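// Store each (possibly vectorized) chunk of the return value to func_retval0
// in the .param address space.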
3491 | for (const unsigned NumElts : VectorInfo) { |
3492 | const MaybeAlign CurrentAlign = ExtendIntegerRetVal |
3493 | ? MaybeAlign(std::nullopt) |
3494 | : commonAlignment(A: RetAlign, Offset: Offsets[I]); |
3495 | |
3496 | SDValue Val; |
3497 | if (NumElts == 1) { |
3498 | Val = GetRetVal(I); |
3499 | } else { |
3500 | SmallVector<SDValue, 4> StoreVals; |
3501 | for (const unsigned J : llvm::seq(Size: NumElts)) { |
3502 | SDValue ValJ = GetRetVal(I + J); |
3503 | if (ValJ.getValueType().isVector()) |
3504 | DAG.ExtractVectorElements(Op: ValJ, Args&: StoreVals); |
3505 | else |
3506 | StoreVals.push_back(Elt: ValJ); |
3507 | } |
3508 | |
3509 | EVT VT = EVT::getVectorVT(Context&: F.getContext(), VT: StoreVals[0].getValueType(), |
3510 | NumElements: StoreVals.size()); |
3511 | Val = DAG.getBuildVector(VT, DL: dl, Ops: StoreVals); |
3512 | } |
3513 | |
3514 | const SDValue RetSymbol = DAG.getExternalSymbol(Sym: "func_retval0" , VT: MVT::i32); |
3515 | SDValue Ptr = |
3516 | DAG.getObjectPtrOffset(SL: dl, Ptr: RetSymbol, Offset: TypeSize::getFixed(ExactSize: Offsets[I])); |
3517 | |
3518 | Chain = DAG.getStore(Chain, dl, Val, Ptr, |
3519 | PtrInfo: MachinePointerInfo(ADDRESS_SPACE_PARAM), Alignment: CurrentAlign); |
3520 | |
3521 | I += NumElts; |
3522 | } |
3523 | |
3524 | return DAG.getNode(Opcode: NVPTXISD::RET_GLUE, DL: dl, VT: MVT::Other, Operand: Chain); |
3525 | } |
3526 | |
3527 | void NVPTXTargetLowering::LowerAsmOperandForConstraint( |
3528 | SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops, |
3529 | SelectionDAG &DAG) const { |
3530 | if (Constraint.size() > 1) |
3531 | return; |
3532 | TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); |
3533 | } |
3534 | |
// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
// TgtMemIntrinsic because we need the information that is only available in
// the "Value" type of the destination pointer. In particular, the address
// space information.
3540 | bool NVPTXTargetLowering::getTgtMemIntrinsic( |
3541 | IntrinsicInfo &Info, const CallInst &I, |
3542 | MachineFunction &MF, unsigned Intrinsic) const { |
3543 | switch (Intrinsic) { |
3544 | default: |
3545 | return false; |
3546 | case Intrinsic::nvvm_match_all_sync_i32p: |
3547 | case Intrinsic::nvvm_match_all_sync_i64p: |
3548 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
3549 | // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute |
3550 | // in order to model data exchange with other threads, but perform no real |
3551 | // memory accesses. |
3552 | Info.memVT = MVT::i1; |
3553 | |
// Our result depends on both our own and the other threads' arguments.
3555 | Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; |
3556 | return true; |
3557 | case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col: |
3558 | case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row: |
3559 | case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride: |
3560 | case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride: |
3561 | case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col: |
3562 | case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row: |
3563 | case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride: |
3564 | case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride: |
3565 | case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col: |
3566 | case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row: |
3567 | case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride: |
3568 | case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride: |
3569 | case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col: |
3570 | case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row: |
3571 | case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride: |
3572 | case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride: |
3573 | case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col: |
3574 | case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row: |
3575 | case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride: |
3576 | case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride: |
3577 | case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col: |
3578 | case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row: |
3579 | case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride: |
3580 | case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: { |
3581 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
3582 | Info.memVT = MVT::v8f16; |
3583 | Info.ptrVal = I.getArgOperand(i: 0); |
3584 | Info.offset = 0; |
3585 | Info.flags = MachineMemOperand::MOLoad; |
3586 | Info.align = Align(16); |
3587 | return true; |
3588 | } |
3589 | case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col: |
3590 | case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride: |
3591 | case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride: |
3592 | case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col: |
3593 | case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row: |
3594 | case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride: |
3595 | case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride: |
3596 | case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row: |
3597 | case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col: |
3598 | case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride: |
3599 | case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row: |
3600 | case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride: |
3601 | case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col: |
3602 | case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride: |
3603 | case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride: |
3604 | case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col: |
3605 | case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row: |
3606 | case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride: |
3607 | case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride: |
3608 | case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row: |
3609 | case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col: |
3610 | case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride: |
3611 | case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row: |
3612 | case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: { |
3613 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
3614 | Info.memVT = MVT::v2i32; |
3615 | Info.ptrVal = I.getArgOperand(i: 0); |
3616 | Info.offset = 0; |
3617 | Info.flags = MachineMemOperand::MOLoad; |
3618 | Info.align = Align(8); |
3619 | return true; |
3620 | } |
3621 | |
3622 | case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col: |
3623 | case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride: |
3624 | case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride: |
3625 | case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col: |
3626 | case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row: |
3627 | case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride: |
3628 | case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride: |
3629 | case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row: |
3630 | case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col: |
3631 | case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride: |
3632 | case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row: |
3633 | case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride: |
3634 | case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col: |
3635 | case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride: |
3636 | case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row: |
3637 | case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride: |
3638 | |
3639 | case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col: |
3640 | case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride: |
3641 | case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride: |
3642 | case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col: |
3643 | case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row: |
3644 | case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride: |
3645 | case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride: |
3646 | case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row: |
3647 | case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col: |
3648 | case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride: |
3649 | case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row: |
3650 | case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride: |
3651 | case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col: |
3652 | case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride: |
3653 | case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row: |
3654 | case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride: |
3655 | case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16: |
3656 | case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16: |
3657 | case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x2_trans_b8: |
3658 | case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x2_trans_b8x16_b4x16_p64: |
3659 | case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x2_trans_b8x16_b6x16_p32: |
3660 | case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x4_b8x16_b4x16_p64: |
3661 | case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x4_b8x16_b6x16_p32: { |
3662 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
3663 | Info.memVT = MVT::v4i32; |
3664 | Info.ptrVal = I.getArgOperand(i: 0); |
3665 | Info.offset = 0; |
3666 | Info.flags = MachineMemOperand::MOLoad; |
3667 | Info.align = Align(16); |
3668 | return true; |
3669 | } |
3670 | |
3671 | case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col: |
3672 | case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride: |
3673 | case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride: |
3674 | case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col: |
3675 | case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row: |
3676 | case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride: |
3677 | case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride: |
3678 | case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row: |
3679 | |
3680 | case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col: |
3681 | case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride: |
3682 | case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride: |
3683 | case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col: |
3684 | case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row: |
3685 | case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride: |
3686 | case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride: |
3687 | case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row: |
3688 | case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row: |
3689 | case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride: |
3690 | case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col: |
3691 | case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride: |
3692 | case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row: |
3693 | case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride: |
3694 | case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride: |
3695 | case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row: |
3696 | case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col: |
3697 | case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride: |
3698 | case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride: |
3699 | case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col: |
3700 | case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16: |
3701 | case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16: |
3702 | case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x1_b8x16_b4x16_p64: |
3703 | case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x1_b8x16_b6x16_p32: { |
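// These variants are modeled as a single 4-byte aligned i32 access through
// the pointer in argument 0.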
3704 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
3705 | Info.memVT = MVT::i32; |
3706 | Info.ptrVal = I.getArgOperand(i: 0); |
3707 | Info.offset = 0; |
3708 | Info.flags = MachineMemOperand::MOLoad; |
3709 | Info.align = Align(4); |
3710 | return true; |
3711 | } |
3712 | |
3713 | case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col: |
3714 | case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row: |
3715 | case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride: |
3716 | case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride: |
3717 | case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col: |
3718 | case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row: |
3719 | case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride: |
3720 | case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride: |
3721 | case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col: |
3722 | case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row: |
3723 | case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride: |
3724 | case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: { |
3725 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
3726 | Info.memVT = MVT::v4f16; |
3727 | Info.ptrVal = I.getArgOperand(i: 0); |
3728 | Info.offset = 0; |
3729 | Info.flags = MachineMemOperand::MOLoad; |
3730 | Info.align = Align(16); |
3731 | return true; |
3732 | } |
3733 | |
3734 | case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col: |
3735 | case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row: |
3736 | case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride: |
3737 | case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride: |
3738 | case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col: |
3739 | case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row: |
3740 | case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride: |
3741 | case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride: |
3742 | case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col: |
3743 | case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row: |
3744 | case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride: |
3745 | case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride: |
3746 | case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col: |
3747 | case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row: |
3748 | case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride: |
3749 | case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: { |
3750 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
3751 | Info.memVT = MVT::v8f32; |
3752 | Info.ptrVal = I.getArgOperand(i: 0); |
3753 | Info.offset = 0; |
3754 | Info.flags = MachineMemOperand::MOLoad; |
3755 | Info.align = Align(16); |
3756 | return true; |
3757 | } |
3758 | |
3759 | case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col: |
3760 | case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride: |
3761 | case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row: |
3762 | case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride: |
3763 | |
3764 | case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col: |
3765 | case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride: |
3766 | case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row: |
3767 | case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride: |
3768 | |
3769 | case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col: |
3770 | case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride: |
3771 | case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row: |
3772 | case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride: |
3773 | case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col: |
3774 | case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride: |
3775 | case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row: |
3776 | case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride: |
3777 | case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col: |
3778 | case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride: |
3779 | case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row: |
3780 | case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: { |
3781 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
3782 | Info.memVT = MVT::v8i32; |
3783 | Info.ptrVal = I.getArgOperand(i: 0); |
3784 | Info.offset = 0; |
3785 | Info.flags = MachineMemOperand::MOLoad; |
3786 | Info.align = Align(16); |
3787 | return true; |
3788 | } |
3789 | |
3790 | case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col: |
3791 | case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride: |
3792 | case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row: |
3793 | case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride: |
3794 | case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col: |
3795 | case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride: |
3796 | case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row: |
3797 | case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride: |
3798 | case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16: |
3799 | case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16: |
3800 | case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x1_trans_b8: |
3801 | case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x1_trans_b8x16_b4x16_p64: |
3802 | case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x1_trans_b8x16_b6x16_p32: |
3803 | case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x2_b8x16_b4x16_p64: |
3804 | case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x2_b8x16_b6x16_p32: { |
3805 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
3806 | Info.memVT = MVT::v2i32; |
3807 | Info.ptrVal = I.getArgOperand(i: 0); |
3808 | Info.offset = 0; |
3809 | Info.flags = MachineMemOperand::MOLoad; |
3810 | Info.align = Align(8); |
3811 | return true; |
3812 | } |
3813 | |
3814 | case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col: |
3815 | case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride: |
3816 | case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row: |
3817 | case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride: |
3818 | |
3819 | case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col: |
3820 | case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride: |
3821 | case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row: |
3822 | case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: { |
3823 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
3824 | Info.memVT = MVT::f64; |
3825 | Info.ptrVal = I.getArgOperand(i: 0); |
3826 | Info.offset = 0; |
3827 | Info.flags = MachineMemOperand::MOLoad; |
3828 | Info.align = Align(8); |
3829 | return true; |
3830 | } |
3831 | |
3832 | case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col: |
3833 | case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride: |
3834 | case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row: |
3835 | case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: { |
3836 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
3837 | Info.memVT = MVT::v2f64; |
3838 | Info.ptrVal = I.getArgOperand(i: 0); |
3839 | Info.offset = 0; |
3840 | Info.flags = MachineMemOperand::MOLoad; |
3841 | Info.align = Align(16); |
3842 | return true; |
3843 | } |
3844 | |
3845 | case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col: |
3846 | case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row: |
3847 | case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride: |
3848 | case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride: |
3849 | case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col: |
3850 | case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row: |
3851 | case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride: |
3852 | case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride: |
3853 | case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col: |
3854 | case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row: |
3855 | case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride: |
3856 | case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: { |
3857 | Info.opc = ISD::INTRINSIC_VOID; |
3858 | Info.memVT = MVT::v4f16; |
3859 | Info.ptrVal = I.getArgOperand(i: 0); |
3860 | Info.offset = 0; |
3861 | Info.flags = MachineMemOperand::MOStore; |
3862 | Info.align = Align(16); |
3863 | return true; |
3864 | } |
3865 | |
3866 | case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col: |
3867 | case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row: |
3868 | case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride: |
3869 | case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride: |
3870 | case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col: |
3871 | case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row: |
3872 | case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride: |
3873 | case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride: |
3874 | case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col: |
3875 | case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row: |
3876 | case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride: |
3877 | case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride: |
3878 | case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col: |
3879 | case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row: |
3880 | case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride: |
3881 | case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: { |
3882 | Info.opc = ISD::INTRINSIC_VOID; |
3883 | Info.memVT = MVT::v8f32; |
3884 | Info.ptrVal = I.getArgOperand(i: 0); |
3885 | Info.offset = 0; |
3886 | Info.flags = MachineMemOperand::MOStore; |
3887 | Info.align = Align(16); |
3888 | return true; |
3889 | } |
3890 | |
3891 | case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col: |
3892 | case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride: |
3893 | case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row: |
3894 | case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride: |
3895 | case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col: |
3896 | case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride: |
3897 | case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row: |
3898 | case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride: |
3899 | case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col: |
3900 | case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride: |
3901 | case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row: |
3902 | case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: { |
3903 | Info.opc = ISD::INTRINSIC_VOID; |
3904 | Info.memVT = MVT::v8i32; |
3905 | Info.ptrVal = I.getArgOperand(i: 0); |
3906 | Info.offset = 0; |
3907 | Info.flags = MachineMemOperand::MOStore; |
3908 | Info.align = Align(16); |
3909 | return true; |
3910 | } |
3911 | |
3912 | case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col: |
3913 | case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride: |
3914 | case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row: |
3915 | case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride: |
3916 | case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col: |
3917 | case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride: |
3918 | case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row: |
3919 | case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: { |
3920 | Info.opc = ISD::INTRINSIC_VOID; |
3921 | Info.memVT = MVT::v2i32; |
3922 | Info.ptrVal = I.getArgOperand(i: 0); |
3923 | Info.offset = 0; |
3924 | Info.flags = MachineMemOperand::MOStore; |
3925 | Info.align = Align(8); |
3926 | return true; |
3927 | } |
3928 | |
3929 | case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col: |
3930 | case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride: |
3931 | case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row: |
3932 | case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: { |
3933 | Info.opc = ISD::INTRINSIC_VOID; |
3934 | Info.memVT = MVT::v2f64; |
3935 | Info.ptrVal = I.getArgOperand(i: 0); |
3936 | Info.offset = 0; |
3937 | Info.flags = MachineMemOperand::MOStore; |
3938 | Info.align = Align(16); |
3939 | return true; |
3940 | } |
3941 | |
3942 | case Intrinsic::nvvm_atomic_add_gen_f_cta: |
3943 | case Intrinsic::nvvm_atomic_add_gen_f_sys: |
3944 | case Intrinsic::nvvm_atomic_add_gen_i_cta: |
3945 | case Intrinsic::nvvm_atomic_add_gen_i_sys: |
3946 | case Intrinsic::nvvm_atomic_and_gen_i_cta: |
3947 | case Intrinsic::nvvm_atomic_and_gen_i_sys: |
3948 | case Intrinsic::nvvm_atomic_cas_gen_i_cta: |
3949 | case Intrinsic::nvvm_atomic_cas_gen_i_sys: |
3950 | case Intrinsic::nvvm_atomic_dec_gen_i_cta: |
3951 | case Intrinsic::nvvm_atomic_dec_gen_i_sys: |
3952 | case Intrinsic::nvvm_atomic_inc_gen_i_cta: |
3953 | case Intrinsic::nvvm_atomic_inc_gen_i_sys: |
3954 | case Intrinsic::nvvm_atomic_max_gen_i_cta: |
3955 | case Intrinsic::nvvm_atomic_max_gen_i_sys: |
3956 | case Intrinsic::nvvm_atomic_min_gen_i_cta: |
3957 | case Intrinsic::nvvm_atomic_min_gen_i_sys: |
3958 | case Intrinsic::nvvm_atomic_or_gen_i_cta: |
3959 | case Intrinsic::nvvm_atomic_or_gen_i_sys: |
3960 | case Intrinsic::nvvm_atomic_exch_gen_i_cta: |
3961 | case Intrinsic::nvvm_atomic_exch_gen_i_sys: |
3962 | case Intrinsic::nvvm_atomic_xor_gen_i_cta: |
3963 | case Intrinsic::nvvm_atomic_xor_gen_i_sys: { |
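// Scoped atomics on the generic address space both read and write memory;
// the memory VT matches the intrinsic's result type and no particular
// alignment is assumed.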
3964 | auto &DL = I.getDataLayout(); |
3965 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
3966 | Info.memVT = getValueType(DL, Ty: I.getType()); |
3967 | Info.ptrVal = I.getArgOperand(i: 0); |
3968 | Info.offset = 0; |
3969 | Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; |
3970 | Info.align.reset(); |
3971 | return true; |
3972 | } |
3973 | |
3974 | case Intrinsic::nvvm_ldu_global_i: |
3975 | case Intrinsic::nvvm_ldu_global_f: |
3976 | case Intrinsic::nvvm_ldu_global_p: { |
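// ldu reads through the pointer in argument 0; the memory VT is the result
// type (or the pointer type for the _p variant), and the alignment is taken
// from the intrinsic's explicit alignment operand (argument 1).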
3977 | auto &DL = I.getDataLayout(); |
3978 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
3979 | if (Intrinsic == Intrinsic::nvvm_ldu_global_i) |
3980 | Info.memVT = getValueType(DL, Ty: I.getType()); |
else if (Intrinsic == Intrinsic::nvvm_ldu_global_p)

3982 | Info.memVT = getPointerTy(DL); |
3983 | else |
3984 | Info.memVT = getValueType(DL, Ty: I.getType()); |
3985 | Info.ptrVal = I.getArgOperand(i: 0); |
3986 | Info.offset = 0; |
3987 | Info.flags = MachineMemOperand::MOLoad; |
3988 | Info.align = cast<ConstantInt>(Val: I.getArgOperand(i: 1))->getMaybeAlignValue(); |
3989 | |
3990 | return true; |
3991 | } |
3992 | case Intrinsic::nvvm_tex_1d_v4f32_s32: |
3993 | case Intrinsic::nvvm_tex_1d_v4f32_f32: |
3994 | case Intrinsic::nvvm_tex_1d_level_v4f32_f32: |
3995 | case Intrinsic::nvvm_tex_1d_grad_v4f32_f32: |
3996 | case Intrinsic::nvvm_tex_1d_array_v4f32_s32: |
3997 | case Intrinsic::nvvm_tex_1d_array_v4f32_f32: |
3998 | case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32: |
3999 | case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32: |
4000 | case Intrinsic::nvvm_tex_2d_v4f32_s32: |
4001 | case Intrinsic::nvvm_tex_2d_v4f32_f32: |
4002 | case Intrinsic::nvvm_tex_2d_level_v4f32_f32: |
4003 | case Intrinsic::nvvm_tex_2d_grad_v4f32_f32: |
4004 | case Intrinsic::nvvm_tex_2d_array_v4f32_s32: |
4005 | case Intrinsic::nvvm_tex_2d_array_v4f32_f32: |
4006 | case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32: |
4007 | case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32: |
4008 | case Intrinsic::nvvm_tex_3d_v4f32_s32: |
4009 | case Intrinsic::nvvm_tex_3d_v4f32_f32: |
4010 | case Intrinsic::nvvm_tex_3d_level_v4f32_f32: |
4011 | case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: |
4012 | case Intrinsic::nvvm_tex_cube_v4f32_f32: |
4013 | case Intrinsic::nvvm_tex_cube_level_v4f32_f32: |
4014 | case Intrinsic::nvvm_tex_cube_array_v4f32_f32: |
4015 | case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32: |
4016 | case Intrinsic::nvvm_tld4_r_2d_v4f32_f32: |
4017 | case Intrinsic::nvvm_tld4_g_2d_v4f32_f32: |
4018 | case Intrinsic::nvvm_tld4_b_2d_v4f32_f32: |
4019 | case Intrinsic::nvvm_tld4_a_2d_v4f32_f32: |
4020 | case Intrinsic::nvvm_tex_unified_1d_v4f32_s32: |
4021 | case Intrinsic::nvvm_tex_unified_1d_v4f32_f32: |
4022 | case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32: |
4023 | case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32: |
4024 | case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32: |
4025 | case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32: |
4026 | case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32: |
4027 | case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32: |
4028 | case Intrinsic::nvvm_tex_unified_2d_v4f32_s32: |
4029 | case Intrinsic::nvvm_tex_unified_2d_v4f32_f32: |
4030 | case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32: |
4031 | case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32: |
4032 | case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32: |
4033 | case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32: |
4034 | case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32: |
4035 | case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32: |
4036 | case Intrinsic::nvvm_tex_unified_3d_v4f32_s32: |
4037 | case Intrinsic::nvvm_tex_unified_3d_v4f32_f32: |
4038 | case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32: |
4039 | case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32: |
4040 | case Intrinsic::nvvm_tex_unified_cube_v4f32_f32: |
4041 | case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32: |
4042 | case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32: |
4043 | case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32: |
4044 | case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32: |
4045 | case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32: |
4046 | case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32: |
4047 | case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32: |
4048 | case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32: |
4049 | case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: |
4050 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
4051 | Info.memVT = MVT::v4f32; |
4052 | Info.ptrVal = nullptr; |
4053 | Info.offset = 0; |
4054 | Info.flags = MachineMemOperand::MOLoad; |
4055 | Info.align = Align(16); |
4056 | return true; |
4057 | |
4058 | case Intrinsic::nvvm_tex_1d_v4s32_s32: |
4059 | case Intrinsic::nvvm_tex_1d_v4s32_f32: |
4060 | case Intrinsic::nvvm_tex_1d_level_v4s32_f32: |
4061 | case Intrinsic::nvvm_tex_1d_grad_v4s32_f32: |
4062 | case Intrinsic::nvvm_tex_1d_array_v4s32_s32: |
4063 | case Intrinsic::nvvm_tex_1d_array_v4s32_f32: |
4064 | case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32: |
4065 | case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32: |
4066 | case Intrinsic::nvvm_tex_2d_v4s32_s32: |
4067 | case Intrinsic::nvvm_tex_2d_v4s32_f32: |
4068 | case Intrinsic::nvvm_tex_2d_level_v4s32_f32: |
4069 | case Intrinsic::nvvm_tex_2d_grad_v4s32_f32: |
4070 | case Intrinsic::nvvm_tex_2d_array_v4s32_s32: |
4071 | case Intrinsic::nvvm_tex_2d_array_v4s32_f32: |
4072 | case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32: |
4073 | case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32: |
4074 | case Intrinsic::nvvm_tex_3d_v4s32_s32: |
4075 | case Intrinsic::nvvm_tex_3d_v4s32_f32: |
4076 | case Intrinsic::nvvm_tex_3d_level_v4s32_f32: |
4077 | case Intrinsic::nvvm_tex_3d_grad_v4s32_f32: |
4078 | case Intrinsic::nvvm_tex_cube_v4s32_f32: |
4079 | case Intrinsic::nvvm_tex_cube_level_v4s32_f32: |
4080 | case Intrinsic::nvvm_tex_cube_array_v4s32_f32: |
4081 | case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32: |
4082 | case Intrinsic::nvvm_tex_cube_v4u32_f32: |
4083 | case Intrinsic::nvvm_tex_cube_level_v4u32_f32: |
4084 | case Intrinsic::nvvm_tex_cube_array_v4u32_f32: |
4085 | case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32: |
4086 | case Intrinsic::nvvm_tex_1d_v4u32_s32: |
4087 | case Intrinsic::nvvm_tex_1d_v4u32_f32: |
4088 | case Intrinsic::nvvm_tex_1d_level_v4u32_f32: |
4089 | case Intrinsic::nvvm_tex_1d_grad_v4u32_f32: |
4090 | case Intrinsic::nvvm_tex_1d_array_v4u32_s32: |
4091 | case Intrinsic::nvvm_tex_1d_array_v4u32_f32: |
4092 | case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32: |
4093 | case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32: |
4094 | case Intrinsic::nvvm_tex_2d_v4u32_s32: |
4095 | case Intrinsic::nvvm_tex_2d_v4u32_f32: |
4096 | case Intrinsic::nvvm_tex_2d_level_v4u32_f32: |
4097 | case Intrinsic::nvvm_tex_2d_grad_v4u32_f32: |
4098 | case Intrinsic::nvvm_tex_2d_array_v4u32_s32: |
4099 | case Intrinsic::nvvm_tex_2d_array_v4u32_f32: |
4100 | case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32: |
4101 | case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32: |
4102 | case Intrinsic::nvvm_tex_3d_v4u32_s32: |
4103 | case Intrinsic::nvvm_tex_3d_v4u32_f32: |
4104 | case Intrinsic::nvvm_tex_3d_level_v4u32_f32: |
4105 | case Intrinsic::nvvm_tex_3d_grad_v4u32_f32: |
4106 | case Intrinsic::nvvm_tld4_r_2d_v4s32_f32: |
4107 | case Intrinsic::nvvm_tld4_g_2d_v4s32_f32: |
4108 | case Intrinsic::nvvm_tld4_b_2d_v4s32_f32: |
4109 | case Intrinsic::nvvm_tld4_a_2d_v4s32_f32: |
4110 | case Intrinsic::nvvm_tld4_r_2d_v4u32_f32: |
4111 | case Intrinsic::nvvm_tld4_g_2d_v4u32_f32: |
4112 | case Intrinsic::nvvm_tld4_b_2d_v4u32_f32: |
4113 | case Intrinsic::nvvm_tld4_a_2d_v4u32_f32: |
4114 | case Intrinsic::nvvm_tex_unified_1d_v4s32_s32: |
4115 | case Intrinsic::nvvm_tex_unified_1d_v4s32_f32: |
4116 | case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32: |
4117 | case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32: |
4118 | case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32: |
4119 | case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32: |
4120 | case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32: |
4121 | case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32: |
4122 | case Intrinsic::nvvm_tex_unified_2d_v4s32_s32: |
4123 | case Intrinsic::nvvm_tex_unified_2d_v4s32_f32: |
4124 | case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32: |
4125 | case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32: |
4126 | case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32: |
4127 | case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32: |
4128 | case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32: |
4129 | case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32: |
4130 | case Intrinsic::nvvm_tex_unified_3d_v4s32_s32: |
4131 | case Intrinsic::nvvm_tex_unified_3d_v4s32_f32: |
4132 | case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32: |
4133 | case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32: |
4134 | case Intrinsic::nvvm_tex_unified_1d_v4u32_s32: |
4135 | case Intrinsic::nvvm_tex_unified_1d_v4u32_f32: |
4136 | case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32: |
4137 | case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32: |
4138 | case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32: |
4139 | case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32: |
4140 | case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32: |
4141 | case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32: |
4142 | case Intrinsic::nvvm_tex_unified_2d_v4u32_s32: |
4143 | case Intrinsic::nvvm_tex_unified_2d_v4u32_f32: |
4144 | case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32: |
4145 | case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32: |
4146 | case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32: |
4147 | case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32: |
4148 | case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32: |
4149 | case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32: |
4150 | case Intrinsic::nvvm_tex_unified_3d_v4u32_s32: |
4151 | case Intrinsic::nvvm_tex_unified_3d_v4u32_f32: |
4152 | case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32: |
4153 | case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32: |
4154 | case Intrinsic::nvvm_tex_unified_cube_v4s32_f32: |
4155 | case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32: |
4156 | case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32: |
4157 | case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32: |
4158 | case Intrinsic::nvvm_tex_unified_cube_v4u32_f32: |
4159 | case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32: |
4160 | case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32: |
4161 | case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32: |
4162 | case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32: |
4163 | case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32: |
4164 | case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32: |
4165 | case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32: |
4166 | case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32: |
4167 | case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32: |
4168 | case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32: |
4169 | case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32: |
4170 | case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32: |
4171 | case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32: |
4172 | case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32: |
4173 | case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: |
4174 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
4175 | Info.memVT = MVT::v4i32; |
4176 | Info.ptrVal = nullptr; |
4177 | Info.offset = 0; |
4178 | Info.flags = MachineMemOperand::MOLoad; |
4179 | Info.align = Align(16); |
4180 | return true; |
4181 | |
4182 | case Intrinsic::nvvm_suld_1d_i8_clamp: |
4183 | case Intrinsic::nvvm_suld_1d_v2i8_clamp: |
4184 | case Intrinsic::nvvm_suld_1d_v4i8_clamp: |
4185 | case Intrinsic::nvvm_suld_1d_array_i8_clamp: |
4186 | case Intrinsic::nvvm_suld_1d_array_v2i8_clamp: |
4187 | case Intrinsic::nvvm_suld_1d_array_v4i8_clamp: |
4188 | case Intrinsic::nvvm_suld_2d_i8_clamp: |
4189 | case Intrinsic::nvvm_suld_2d_v2i8_clamp: |
4190 | case Intrinsic::nvvm_suld_2d_v4i8_clamp: |
4191 | case Intrinsic::nvvm_suld_2d_array_i8_clamp: |
4192 | case Intrinsic::nvvm_suld_2d_array_v2i8_clamp: |
4193 | case Intrinsic::nvvm_suld_2d_array_v4i8_clamp: |
4194 | case Intrinsic::nvvm_suld_3d_i8_clamp: |
4195 | case Intrinsic::nvvm_suld_3d_v2i8_clamp: |
4196 | case Intrinsic::nvvm_suld_3d_v4i8_clamp: |
4197 | case Intrinsic::nvvm_suld_1d_i8_trap: |
4198 | case Intrinsic::nvvm_suld_1d_v2i8_trap: |
4199 | case Intrinsic::nvvm_suld_1d_v4i8_trap: |
4200 | case Intrinsic::nvvm_suld_1d_array_i8_trap: |
4201 | case Intrinsic::nvvm_suld_1d_array_v2i8_trap: |
4202 | case Intrinsic::nvvm_suld_1d_array_v4i8_trap: |
4203 | case Intrinsic::nvvm_suld_2d_i8_trap: |
4204 | case Intrinsic::nvvm_suld_2d_v2i8_trap: |
4205 | case Intrinsic::nvvm_suld_2d_v4i8_trap: |
4206 | case Intrinsic::nvvm_suld_2d_array_i8_trap: |
4207 | case Intrinsic::nvvm_suld_2d_array_v2i8_trap: |
4208 | case Intrinsic::nvvm_suld_2d_array_v4i8_trap: |
4209 | case Intrinsic::nvvm_suld_3d_i8_trap: |
4210 | case Intrinsic::nvvm_suld_3d_v2i8_trap: |
4211 | case Intrinsic::nvvm_suld_3d_v4i8_trap: |
4212 | case Intrinsic::nvvm_suld_1d_i8_zero: |
4213 | case Intrinsic::nvvm_suld_1d_v2i8_zero: |
4214 | case Intrinsic::nvvm_suld_1d_v4i8_zero: |
4215 | case Intrinsic::nvvm_suld_1d_array_i8_zero: |
4216 | case Intrinsic::nvvm_suld_1d_array_v2i8_zero: |
4217 | case Intrinsic::nvvm_suld_1d_array_v4i8_zero: |
4218 | case Intrinsic::nvvm_suld_2d_i8_zero: |
4219 | case Intrinsic::nvvm_suld_2d_v2i8_zero: |
4220 | case Intrinsic::nvvm_suld_2d_v4i8_zero: |
4221 | case Intrinsic::nvvm_suld_2d_array_i8_zero: |
4222 | case Intrinsic::nvvm_suld_2d_array_v2i8_zero: |
4223 | case Intrinsic::nvvm_suld_2d_array_v4i8_zero: |
4224 | case Intrinsic::nvvm_suld_3d_i8_zero: |
4225 | case Intrinsic::nvvm_suld_3d_v2i8_zero: |
4226 | case Intrinsic::nvvm_suld_3d_v4i8_zero: |
4227 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
4228 | Info.memVT = MVT::i8; |
4229 | Info.ptrVal = nullptr; |
4230 | Info.offset = 0; |
4231 | Info.flags = MachineMemOperand::MOLoad; |
4232 | Info.align = Align(16); |
4233 | return true; |
4234 | |
4235 | case Intrinsic::nvvm_suld_1d_i16_clamp: |
4236 | case Intrinsic::nvvm_suld_1d_v2i16_clamp: |
4237 | case Intrinsic::nvvm_suld_1d_v4i16_clamp: |
4238 | case Intrinsic::nvvm_suld_1d_array_i16_clamp: |
4239 | case Intrinsic::nvvm_suld_1d_array_v2i16_clamp: |
4240 | case Intrinsic::nvvm_suld_1d_array_v4i16_clamp: |
4241 | case Intrinsic::nvvm_suld_2d_i16_clamp: |
4242 | case Intrinsic::nvvm_suld_2d_v2i16_clamp: |
4243 | case Intrinsic::nvvm_suld_2d_v4i16_clamp: |
4244 | case Intrinsic::nvvm_suld_2d_array_i16_clamp: |
4245 | case Intrinsic::nvvm_suld_2d_array_v2i16_clamp: |
4246 | case Intrinsic::nvvm_suld_2d_array_v4i16_clamp: |
4247 | case Intrinsic::nvvm_suld_3d_i16_clamp: |
4248 | case Intrinsic::nvvm_suld_3d_v2i16_clamp: |
4249 | case Intrinsic::nvvm_suld_3d_v4i16_clamp: |
4250 | case Intrinsic::nvvm_suld_1d_i16_trap: |
4251 | case Intrinsic::nvvm_suld_1d_v2i16_trap: |
4252 | case Intrinsic::nvvm_suld_1d_v4i16_trap: |
4253 | case Intrinsic::nvvm_suld_1d_array_i16_trap: |
4254 | case Intrinsic::nvvm_suld_1d_array_v2i16_trap: |
4255 | case Intrinsic::nvvm_suld_1d_array_v4i16_trap: |
4256 | case Intrinsic::nvvm_suld_2d_i16_trap: |
4257 | case Intrinsic::nvvm_suld_2d_v2i16_trap: |
4258 | case Intrinsic::nvvm_suld_2d_v4i16_trap: |
4259 | case Intrinsic::nvvm_suld_2d_array_i16_trap: |
4260 | case Intrinsic::nvvm_suld_2d_array_v2i16_trap: |
4261 | case Intrinsic::nvvm_suld_2d_array_v4i16_trap: |
4262 | case Intrinsic::nvvm_suld_3d_i16_trap: |
4263 | case Intrinsic::nvvm_suld_3d_v2i16_trap: |
4264 | case Intrinsic::nvvm_suld_3d_v4i16_trap: |
4265 | case Intrinsic::nvvm_suld_1d_i16_zero: |
4266 | case Intrinsic::nvvm_suld_1d_v2i16_zero: |
4267 | case Intrinsic::nvvm_suld_1d_v4i16_zero: |
4268 | case Intrinsic::nvvm_suld_1d_array_i16_zero: |
4269 | case Intrinsic::nvvm_suld_1d_array_v2i16_zero: |
4270 | case Intrinsic::nvvm_suld_1d_array_v4i16_zero: |
4271 | case Intrinsic::nvvm_suld_2d_i16_zero: |
4272 | case Intrinsic::nvvm_suld_2d_v2i16_zero: |
4273 | case Intrinsic::nvvm_suld_2d_v4i16_zero: |
4274 | case Intrinsic::nvvm_suld_2d_array_i16_zero: |
4275 | case Intrinsic::nvvm_suld_2d_array_v2i16_zero: |
4276 | case Intrinsic::nvvm_suld_2d_array_v4i16_zero: |
4277 | case Intrinsic::nvvm_suld_3d_i16_zero: |
4278 | case Intrinsic::nvvm_suld_3d_v2i16_zero: |
4279 | case Intrinsic::nvvm_suld_3d_v4i16_zero: |
4280 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
4281 | Info.memVT = MVT::i16; |
4282 | Info.ptrVal = nullptr; |
4283 | Info.offset = 0; |
4284 | Info.flags = MachineMemOperand::MOLoad; |
4285 | Info.align = Align(16); |
4286 | return true; |
4287 | |
4288 | case Intrinsic::nvvm_suld_1d_i32_clamp: |
4289 | case Intrinsic::nvvm_suld_1d_v2i32_clamp: |
4290 | case Intrinsic::nvvm_suld_1d_v4i32_clamp: |
4291 | case Intrinsic::nvvm_suld_1d_array_i32_clamp: |
4292 | case Intrinsic::nvvm_suld_1d_array_v2i32_clamp: |
4293 | case Intrinsic::nvvm_suld_1d_array_v4i32_clamp: |
4294 | case Intrinsic::nvvm_suld_2d_i32_clamp: |
4295 | case Intrinsic::nvvm_suld_2d_v2i32_clamp: |
4296 | case Intrinsic::nvvm_suld_2d_v4i32_clamp: |
4297 | case Intrinsic::nvvm_suld_2d_array_i32_clamp: |
4298 | case Intrinsic::nvvm_suld_2d_array_v2i32_clamp: |
4299 | case Intrinsic::nvvm_suld_2d_array_v4i32_clamp: |
4300 | case Intrinsic::nvvm_suld_3d_i32_clamp: |
4301 | case Intrinsic::nvvm_suld_3d_v2i32_clamp: |
4302 | case Intrinsic::nvvm_suld_3d_v4i32_clamp: |
4303 | case Intrinsic::nvvm_suld_1d_i32_trap: |
4304 | case Intrinsic::nvvm_suld_1d_v2i32_trap: |
4305 | case Intrinsic::nvvm_suld_1d_v4i32_trap: |
4306 | case Intrinsic::nvvm_suld_1d_array_i32_trap: |
4307 | case Intrinsic::nvvm_suld_1d_array_v2i32_trap: |
4308 | case Intrinsic::nvvm_suld_1d_array_v4i32_trap: |
4309 | case Intrinsic::nvvm_suld_2d_i32_trap: |
4310 | case Intrinsic::nvvm_suld_2d_v2i32_trap: |
4311 | case Intrinsic::nvvm_suld_2d_v4i32_trap: |
4312 | case Intrinsic::nvvm_suld_2d_array_i32_trap: |
4313 | case Intrinsic::nvvm_suld_2d_array_v2i32_trap: |
4314 | case Intrinsic::nvvm_suld_2d_array_v4i32_trap: |
4315 | case Intrinsic::nvvm_suld_3d_i32_trap: |
4316 | case Intrinsic::nvvm_suld_3d_v2i32_trap: |
4317 | case Intrinsic::nvvm_suld_3d_v4i32_trap: |
4318 | case Intrinsic::nvvm_suld_1d_i32_zero: |
4319 | case Intrinsic::nvvm_suld_1d_v2i32_zero: |
4320 | case Intrinsic::nvvm_suld_1d_v4i32_zero: |
4321 | case Intrinsic::nvvm_suld_1d_array_i32_zero: |
4322 | case Intrinsic::nvvm_suld_1d_array_v2i32_zero: |
4323 | case Intrinsic::nvvm_suld_1d_array_v4i32_zero: |
4324 | case Intrinsic::nvvm_suld_2d_i32_zero: |
4325 | case Intrinsic::nvvm_suld_2d_v2i32_zero: |
4326 | case Intrinsic::nvvm_suld_2d_v4i32_zero: |
4327 | case Intrinsic::nvvm_suld_2d_array_i32_zero: |
4328 | case Intrinsic::nvvm_suld_2d_array_v2i32_zero: |
4329 | case Intrinsic::nvvm_suld_2d_array_v4i32_zero: |
4330 | case Intrinsic::nvvm_suld_3d_i32_zero: |
4331 | case Intrinsic::nvvm_suld_3d_v2i32_zero: |
4332 | case Intrinsic::nvvm_suld_3d_v4i32_zero: |
4333 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
4334 | Info.memVT = MVT::i32; |
4335 | Info.ptrVal = nullptr; |
4336 | Info.offset = 0; |
4337 | Info.flags = MachineMemOperand::MOLoad; |
4338 | Info.align = Align(16); |
4339 | return true; |
4340 | |
4341 | case Intrinsic::nvvm_suld_1d_i64_clamp: |
4342 | case Intrinsic::nvvm_suld_1d_v2i64_clamp: |
4343 | case Intrinsic::nvvm_suld_1d_array_i64_clamp: |
4344 | case Intrinsic::nvvm_suld_1d_array_v2i64_clamp: |
4345 | case Intrinsic::nvvm_suld_2d_i64_clamp: |
4346 | case Intrinsic::nvvm_suld_2d_v2i64_clamp: |
4347 | case Intrinsic::nvvm_suld_2d_array_i64_clamp: |
4348 | case Intrinsic::nvvm_suld_2d_array_v2i64_clamp: |
4349 | case Intrinsic::nvvm_suld_3d_i64_clamp: |
4350 | case Intrinsic::nvvm_suld_3d_v2i64_clamp: |
4351 | case Intrinsic::nvvm_suld_1d_i64_trap: |
4352 | case Intrinsic::nvvm_suld_1d_v2i64_trap: |
4353 | case Intrinsic::nvvm_suld_1d_array_i64_trap: |
4354 | case Intrinsic::nvvm_suld_1d_array_v2i64_trap: |
4355 | case Intrinsic::nvvm_suld_2d_i64_trap: |
4356 | case Intrinsic::nvvm_suld_2d_v2i64_trap: |
4357 | case Intrinsic::nvvm_suld_2d_array_i64_trap: |
4358 | case Intrinsic::nvvm_suld_2d_array_v2i64_trap: |
4359 | case Intrinsic::nvvm_suld_3d_i64_trap: |
4360 | case Intrinsic::nvvm_suld_3d_v2i64_trap: |
4361 | case Intrinsic::nvvm_suld_1d_i64_zero: |
4362 | case Intrinsic::nvvm_suld_1d_v2i64_zero: |
4363 | case Intrinsic::nvvm_suld_1d_array_i64_zero: |
4364 | case Intrinsic::nvvm_suld_1d_array_v2i64_zero: |
4365 | case Intrinsic::nvvm_suld_2d_i64_zero: |
4366 | case Intrinsic::nvvm_suld_2d_v2i64_zero: |
4367 | case Intrinsic::nvvm_suld_2d_array_i64_zero: |
4368 | case Intrinsic::nvvm_suld_2d_array_v2i64_zero: |
4369 | case Intrinsic::nvvm_suld_3d_i64_zero: |
4370 | case Intrinsic::nvvm_suld_3d_v2i64_zero: |
4371 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
4372 | Info.memVT = MVT::i64; |
4373 | Info.ptrVal = nullptr; |
4374 | Info.offset = 0; |
4375 | Info.flags = MachineMemOperand::MOLoad; |
4376 | Info.align = Align(16); |
4377 | return true; |
4378 | |
4379 | case Intrinsic::nvvm_tcgen05_ld_16x64b_x1: |
4380 | case Intrinsic::nvvm_tcgen05_ld_32x32b_x1: |
4381 | case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1: { |
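// tcgen05.ld results are modeled as i32 data whose element count scales
// with the xN multiplier (v1i32 here, up to v128i32 below); the alignment
// is left unspecified.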
4382 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
4383 | Info.memVT = MVT::v1i32; |
4384 | Info.ptrVal = I.getArgOperand(i: 0); |
4385 | Info.offset = 0; |
4386 | Info.flags = MachineMemOperand::MOLoad; |
4387 | Info.align.reset(); |
4388 | return true; |
4389 | } |
4390 | |
4391 | case Intrinsic::nvvm_tcgen05_ld_16x64b_x2: |
4392 | case Intrinsic::nvvm_tcgen05_ld_16x128b_x1: |
4393 | case Intrinsic::nvvm_tcgen05_ld_32x32b_x2: |
4394 | case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2: { |
4395 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
4396 | Info.memVT = MVT::v2i32; |
4397 | Info.ptrVal = I.getArgOperand(i: 0); |
4398 | Info.offset = 0; |
4399 | Info.flags = MachineMemOperand::MOLoad; |
4400 | Info.align.reset(); |
4401 | return true; |
4402 | } |
4403 | |
4404 | case Intrinsic::nvvm_tcgen05_ld_16x64b_x4: |
4405 | case Intrinsic::nvvm_tcgen05_ld_16x128b_x2: |
4406 | case Intrinsic::nvvm_tcgen05_ld_32x32b_x4: |
4407 | case Intrinsic::nvvm_tcgen05_ld_16x256b_x1: |
4408 | case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4: { |
4409 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
4410 | Info.memVT = MVT::v4i32; |
4411 | Info.ptrVal = I.getArgOperand(i: 0); |
4412 | Info.offset = 0; |
4413 | Info.flags = MachineMemOperand::MOLoad; |
4414 | Info.align.reset(); |
4415 | return true; |
4416 | } |
4417 | |
4418 | case Intrinsic::nvvm_tcgen05_ld_16x64b_x8: |
4419 | case Intrinsic::nvvm_tcgen05_ld_16x128b_x4: |
4420 | case Intrinsic::nvvm_tcgen05_ld_16x256b_x2: |
4421 | case Intrinsic::nvvm_tcgen05_ld_32x32b_x8: |
4422 | case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8: { |
4423 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
4424 | Info.memVT = MVT::v8i32; |
4425 | Info.ptrVal = I.getArgOperand(i: 0); |
4426 | Info.offset = 0; |
4427 | Info.flags = MachineMemOperand::MOLoad; |
4428 | Info.align.reset(); |
4429 | return true; |
4430 | } |
4431 | |
4432 | case Intrinsic::nvvm_tcgen05_ld_16x64b_x16: |
4433 | case Intrinsic::nvvm_tcgen05_ld_16x128b_x8: |
4434 | case Intrinsic::nvvm_tcgen05_ld_16x256b_x4: |
4435 | case Intrinsic::nvvm_tcgen05_ld_32x32b_x16: |
4436 | case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16: { |
4437 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
4438 | Info.memVT = MVT::v16i32; |
4439 | Info.ptrVal = I.getArgOperand(i: 0); |
4440 | Info.offset = 0; |
4441 | Info.flags = MachineMemOperand::MOLoad; |
4442 | Info.align.reset(); |
4443 | return true; |
4444 | } |
4445 | |
4446 | case Intrinsic::nvvm_tcgen05_ld_16x64b_x32: |
4447 | case Intrinsic::nvvm_tcgen05_ld_16x128b_x16: |
4448 | case Intrinsic::nvvm_tcgen05_ld_16x256b_x8: |
4449 | case Intrinsic::nvvm_tcgen05_ld_32x32b_x32: |
4450 | case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32: { |
4451 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
4452 | Info.memVT = MVT::v32i32; |
4453 | Info.ptrVal = I.getArgOperand(i: 0); |
4454 | Info.offset = 0; |
4455 | Info.flags = MachineMemOperand::MOLoad; |
4456 | Info.align.reset(); |
4457 | return true; |
4458 | } |
4459 | |
4460 | case Intrinsic::nvvm_tcgen05_ld_16x64b_x64: |
4461 | case Intrinsic::nvvm_tcgen05_ld_16x128b_x32: |
4462 | case Intrinsic::nvvm_tcgen05_ld_16x256b_x16: |
4463 | case Intrinsic::nvvm_tcgen05_ld_32x32b_x64: |
4464 | case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64: { |
4465 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
4466 | Info.memVT = MVT::v64i32; |
4467 | Info.ptrVal = I.getArgOperand(i: 0); |
4468 | Info.offset = 0; |
4469 | Info.flags = MachineMemOperand::MOLoad; |
4470 | Info.align.reset(); |
4471 | return true; |
4472 | } |
4473 | |
4474 | case Intrinsic::nvvm_tcgen05_ld_16x64b_x128: |
4475 | case Intrinsic::nvvm_tcgen05_ld_16x128b_x64: |
4476 | case Intrinsic::nvvm_tcgen05_ld_16x256b_x32: |
4477 | case Intrinsic::nvvm_tcgen05_ld_32x32b_x128: |
4478 | case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128: { |
4479 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
4480 | Info.memVT = MVT::v128i32; |
4481 | Info.ptrVal = I.getArgOperand(i: 0); |
4482 | Info.offset = 0; |
4483 | Info.flags = MachineMemOperand::MOLoad; |
4484 | Info.align.reset(); |
4485 | return true; |
4486 | } |
4487 | |
4488 | case Intrinsic::nvvm_tcgen05_st_16x64b_x1: |
4489 | case Intrinsic::nvvm_tcgen05_st_32x32b_x1: |
4490 | case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1: { |
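// tcgen05.st mirrors tcgen05.ld: the stored data is modeled as i32 values
// whose element count scales with the xN multiplier (scalar i32 here, up to
// v128i32 below).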
4491 | Info.opc = ISD::INTRINSIC_VOID; |
4492 | Info.memVT = MVT::i32; |
4493 | Info.ptrVal = I.getArgOperand(i: 0); |
4494 | Info.offset = 0; |
4495 | Info.flags = MachineMemOperand::MOStore; |
4496 | Info.align.reset(); |
4497 | return true; |
4498 | } |
4499 | |
4500 | case Intrinsic::nvvm_tcgen05_st_16x64b_x2: |
4501 | case Intrinsic::nvvm_tcgen05_st_16x128b_x1: |
4502 | case Intrinsic::nvvm_tcgen05_st_32x32b_x2: |
4503 | case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2: { |
4504 | Info.opc = ISD::INTRINSIC_VOID; |
4505 | Info.memVT = MVT::v2i32; |
4506 | Info.ptrVal = I.getArgOperand(i: 0); |
4507 | Info.offset = 0; |
4508 | Info.flags = MachineMemOperand::MOStore; |
4509 | Info.align.reset(); |
4510 | return true; |
4511 | } |
4512 | |
4513 | case Intrinsic::nvvm_tcgen05_st_16x64b_x4: |
4514 | case Intrinsic::nvvm_tcgen05_st_16x128b_x2: |
4515 | case Intrinsic::nvvm_tcgen05_st_16x256b_x1: |
4516 | case Intrinsic::nvvm_tcgen05_st_32x32b_x4: |
4517 | case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4: { |
4518 | Info.opc = ISD::INTRINSIC_VOID; |
4519 | Info.memVT = MVT::v4i32; |
4520 | Info.ptrVal = I.getArgOperand(i: 0); |
4521 | Info.offset = 0; |
4522 | Info.flags = MachineMemOperand::MOStore; |
4523 | Info.align.reset(); |
4524 | return true; |
4525 | } |
4526 | |
4527 | case Intrinsic::nvvm_tcgen05_st_16x64b_x8: |
4528 | case Intrinsic::nvvm_tcgen05_st_16x128b_x4: |
4529 | case Intrinsic::nvvm_tcgen05_st_16x256b_x2: |
4530 | case Intrinsic::nvvm_tcgen05_st_32x32b_x8: |
4531 | case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8: { |
4532 | Info.opc = ISD::INTRINSIC_VOID; |
4533 | Info.memVT = MVT::v8i32; |
4534 | Info.ptrVal = I.getArgOperand(i: 0); |
4535 | Info.offset = 0; |
4536 | Info.flags = MachineMemOperand::MOStore; |
4537 | Info.align.reset(); |
4538 | return true; |
4539 | } |
4540 | |
4541 | case Intrinsic::nvvm_tcgen05_st_16x64b_x16: |
4542 | case Intrinsic::nvvm_tcgen05_st_16x128b_x8: |
4543 | case Intrinsic::nvvm_tcgen05_st_16x256b_x4: |
4544 | case Intrinsic::nvvm_tcgen05_st_32x32b_x16: |
4545 | case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16: { |
4546 | Info.opc = ISD::INTRINSIC_VOID; |
4547 | Info.memVT = MVT::v16i32; |
4548 | Info.ptrVal = I.getArgOperand(i: 0); |
4549 | Info.offset = 0; |
4550 | Info.flags = MachineMemOperand::MOStore; |
4551 | Info.align.reset(); |
4552 | return true; |
4553 | } |
4554 | |
4555 | case Intrinsic::nvvm_tcgen05_st_16x64b_x32: |
4556 | case Intrinsic::nvvm_tcgen05_st_16x128b_x16: |
4557 | case Intrinsic::nvvm_tcgen05_st_16x256b_x8: |
4558 | case Intrinsic::nvvm_tcgen05_st_32x32b_x32: |
4559 | case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32: { |
4560 | Info.opc = ISD::INTRINSIC_VOID; |
4561 | Info.memVT = MVT::v32i32; |
4562 | Info.ptrVal = I.getArgOperand(i: 0); |
4563 | Info.offset = 0; |
4564 | Info.flags = MachineMemOperand::MOStore; |
4565 | Info.align.reset(); |
4566 | return true; |
4567 | } |
4568 | |
4569 | case Intrinsic::nvvm_tcgen05_st_16x64b_x64: |
4570 | case Intrinsic::nvvm_tcgen05_st_16x128b_x32: |
4571 | case Intrinsic::nvvm_tcgen05_st_16x256b_x16: |
4572 | case Intrinsic::nvvm_tcgen05_st_32x32b_x64: |
4573 | case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64: { |
4574 | Info.opc = ISD::INTRINSIC_VOID; |
4575 | Info.memVT = MVT::v64i32; |
4576 | Info.ptrVal = I.getArgOperand(i: 0); |
4577 | Info.offset = 0; |
4578 | Info.flags = MachineMemOperand::MOStore; |
4579 | Info.align.reset(); |
4580 | return true; |
4581 | } |
4582 | |
4583 | case Intrinsic::nvvm_tcgen05_st_16x64b_x128: |
4584 | case Intrinsic::nvvm_tcgen05_st_16x128b_x64: |
4585 | case Intrinsic::nvvm_tcgen05_st_16x256b_x32: |
4586 | case Intrinsic::nvvm_tcgen05_st_32x32b_x128: |
4587 | case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128: { |
4588 | Info.opc = ISD::INTRINSIC_VOID; |
4589 | Info.memVT = MVT::v128i32; |
4590 | Info.ptrVal = I.getArgOperand(i: 0); |
4591 | Info.offset = 0; |
4592 | Info.flags = MachineMemOperand::MOStore; |
4593 | Info.align.reset(); |
4594 | return true; |
4595 | } |
4596 | } |
4597 | return false; |
4598 | } |
4599 | |
/// getFunctionParamOptimizedAlign - Since function arguments are passed via
/// .param space, we may want to increase their alignment in a way that
/// ensures that we can effectively vectorize their loads & stores. We can
/// increase alignment only if the function has internal or private linkage,
/// as for other linkage types callers may already rely on the default
/// alignment. To allow using 128-bit vectorized loads/stores, this function
/// ensures that alignment is 16 or greater.
4607 | Align NVPTXTargetLowering::getFunctionParamOptimizedAlign( |
4608 | const Function *F, Type *ArgTy, const DataLayout &DL) const { |
4609 | // Capping the alignment to 128 bytes as that is the maximum alignment |
4610 | // supported by PTX. |
4611 | const Align ABITypeAlign = std::min(a: Align(128), b: DL.getABITypeAlign(Ty: ArgTy)); |
4612 | |
// If a function has linkage different from internal or private, we
// must use the default ABI alignment as external users rely on it. The
// same applies to a function whose address is taken and which may
// therefore be called through a function pointer.
4616 | if (!F || !F->hasLocalLinkage() || |
4617 | F->hasAddressTaken(/*Users=*/nullptr, |
4618 | /*IgnoreCallbackUses=*/false, |
4619 | /*IgnoreAssumeLikeCalls=*/true, |
/*IgnoreLLVMUsed=*/true))
4621 | return ABITypeAlign; |
4622 | |
4623 | assert(!isKernelFunction(*F) && "Expect kernels to have non-local linkage" ); |
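// Raise the alignment to at least 16 bytes so 128-bit vectorized
// loads/stores of the .param can be used.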
4624 | return std::max(a: Align(16), b: ABITypeAlign); |
4625 | } |
4626 | |
4627 | /// Helper for computing alignment of a device function byval parameter. |
4628 | Align NVPTXTargetLowering::getFunctionByValParamAlign( |
4629 | const Function *F, Type *ArgTy, Align InitialAlign, |
4630 | const DataLayout &DL) const { |
4631 | Align ArgAlign = InitialAlign; |
4632 | // Try to increase alignment to enhance vectorization options. |
4633 | if (F) |
4634 | ArgAlign = std::max(a: ArgAlign, b: getFunctionParamOptimizedAlign(F, ArgTy, DL)); |
4635 | |
// Old ptx versions have a bug. When PTX code takes the address of a
// byval parameter with alignment < 4, ptxas generates code to
// spill the argument into memory. Alas, on sm_50+ ptxas generates
// SASS code that fails with a misaligned access. To work around
// the problem, make sure that we align byval parameters by at
// least 4. This bug seems to be fixed at least starting from
// ptxas > 9.0.
// TODO: remove this after verifying the bug is not reproduced
// on non-deprecated ptxas versions.
4645 | if (ForceMinByValParamAlign) |
4646 | ArgAlign = std::max(a: ArgAlign, b: Align(4)); |
4647 | |
4648 | return ArgAlign; |
4649 | } |
4650 | |
// Helper for getting a function parameter name. The name is composed from
// the parameter's index and the function name. A negative index corresponds
// to the special parameter (an unsized array) used for passing variable
// arguments.
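// For example, for a function emitted as symbol "foo", index 2 yields
// "foo_param_2" and a negative index yields "foo_vararg".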
4654 | std::string NVPTXTargetLowering::getParamName(const Function *F, |
4655 | int Idx) const { |
4656 | std::string ParamName; |
4657 | raw_string_ostream ParamStr(ParamName); |
4658 | |
4659 | ParamStr << getTargetMachine().getSymbol(GV: F)->getName(); |
4660 | if (Idx < 0) |
4661 | ParamStr << "_vararg" ; |
4662 | else |
4663 | ParamStr << "_param_" << Idx; |
4664 | |
4665 | return ParamName; |
4666 | } |
4667 | |
4668 | /// isLegalAddressingMode - Return true if the addressing mode represented |
4669 | /// by AM is legal for this target, for a load/store of the specified type. |
4670 | /// Used to guide target specific optimizations, like loop strength reduction |
4671 | /// (LoopStrengthReduce.cpp) and memory optimization for address mode |
4672 | /// (CodeGenPrepare.cpp) |
4673 | bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL, |
4674 | const AddrMode &AM, Type *Ty, |
4675 | unsigned AS, Instruction *I) const { |
4676 | // AddrMode - This represents an addressing mode of: |
4677 | // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg |
4678 | // |
4679 | // The legal address modes are |
4680 | // - [avar] |
4681 | // - [areg] |
4682 | // - [areg+immoff] |
4683 | // - [immAddr] |
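//
// For example, {BaseGV set, everything else zero} corresponds to [avar],
// {HasBaseReg only} to [areg], {HasBaseReg, BaseOffs} to [areg+immoff],
// and {BaseOffs only} to [immAddr].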
4684 | |
4685 | // immoff must fit in a signed 32-bit int |
4686 | if (!APInt(64, AM.BaseOffs).isSignedIntN(N: 32)) |
4687 | return false; |
4688 | |
4689 | if (AM.BaseGV) |
4690 | return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale; |
4691 | |
4692 | switch (AM.Scale) { |
4693 | case 0: // "r", "r+i" or "i" is allowed |
4694 | break; |
4695 | case 1: |
4696 | if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed. |
4697 | return false; |
4698 | // Otherwise we have r+i. |
4699 | break; |
4700 | default: |
4701 | // No scale > 1 is allowed |
4702 | return false; |
4703 | } |
4704 | return true; |
4705 | } |
4706 | |
4707 | //===----------------------------------------------------------------------===// |
4708 | // NVPTX Inline Assembly Support |
4709 | //===----------------------------------------------------------------------===// |
4710 | |
4711 | /// getConstraintType - Given a constraint letter, return the type of |
4712 | /// constraint it is for this target. |
4713 | NVPTXTargetLowering::ConstraintType |
4714 | NVPTXTargetLowering::getConstraintType(StringRef Constraint) const { |
4715 | if (Constraint.size() == 1) { |
4716 | switch (Constraint[0]) { |
4717 | default: |
4718 | break; |
4719 | case 'b': |
4720 | case 'r': |
4721 | case 'h': |
4722 | case 'c': |
4723 | case 'l': |
4724 | case 'f': |
4725 | case 'd': |
4726 | case 'q': |
4727 | case '0': |
4728 | case 'N': |
4729 | return C_RegisterClass; |
4730 | } |
4731 | } |
4732 | return TargetLowering::getConstraintType(Constraint); |
4733 | } |
4734 | |
4735 | std::pair<unsigned, const TargetRegisterClass *> |
4736 | NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, |
4737 | StringRef Constraint, |
4738 | MVT VT) const { |
4739 | if (Constraint.size() == 1) { |
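// Map single-letter constraints onto the untyped Bn register classes by
// operand width: 'b' -> B1, 'c'/'h' -> B16, 'r'/'f' -> B32,
// 'l'/'N'/'d' -> B64, and 'q' -> B128 (sm_70+ only).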
4740 | switch (Constraint[0]) { |
4741 | case 'b': |
4742 | return std::make_pair(x: 0U, y: &NVPTX::B1RegClass); |
4743 | case 'c': |
4744 | case 'h': |
4745 | return std::make_pair(x: 0U, y: &NVPTX::B16RegClass); |
4746 | case 'r': |
4747 | case 'f': |
4748 | return std::make_pair(x: 0U, y: &NVPTX::B32RegClass); |
4749 | case 'l': |
4750 | case 'N': |
4751 | case 'd': |
4752 | return std::make_pair(x: 0U, y: &NVPTX::B64RegClass); |
4753 | case 'q': { |
4754 | if (STI.getSmVersion() < 70) |
4755 | report_fatal_error(reason: "Inline asm with 128 bit operands is only " |
4756 | "supported for sm_70 and higher!" ); |
4757 | return std::make_pair(x: 0U, y: &NVPTX::B128RegClass); |
4758 | } |
4759 | } |
4760 | } |
4761 | return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); |
4762 | } |
4763 | |
4764 | //===----------------------------------------------------------------------===// |
4765 | // NVPTX DAG Combining |
4766 | //===----------------------------------------------------------------------===// |
4767 | |
4768 | bool NVPTXTargetLowering::allowFMA(MachineFunction &MF, |
4769 | CodeGenOptLevel OptLevel) const { |
4770 | // Always honor command-line argument |
4771 | if (FMAContractLevelOpt.getNumOccurrences() > 0) |
4772 | return FMAContractLevelOpt > 0; |
4773 | |
4774 | // Do not contract if we're not optimizing the code. |
4775 | if (OptLevel == CodeGenOptLevel::None) |
4776 | return false; |
4777 | |
4778 | // Honor TargetOptions flags that explicitly say fusion is okay. |
4779 | if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast) |
4780 | return true; |
4781 | |
4782 | return allowUnsafeFPMath(MF); |
4783 | } |
4784 | |
4785 | bool NVPTXTargetLowering::allowUnsafeFPMath(const MachineFunction &MF) const { |
4786 | // Honor TargetOptions flags that explicitly say unsafe math is okay. |
4787 | if (MF.getTarget().Options.UnsafeFPMath) |
4788 | return true; |
4789 | |
4790 | // Allow unsafe math if unsafe-fp-math attribute explicitly says so. |
4791 | const Function &F = MF.getFunction(); |
4792 | return F.getFnAttribute(Kind: "unsafe-fp-math" ).getValueAsBool(); |
4793 | } |
4794 | |
4795 | static bool isConstZero(const SDValue &Operand) { |
4796 | const auto *Const = dyn_cast<ConstantSDNode>(Val: Operand); |
4797 | return Const && Const->getZExtValue() == 0; |
4798 | } |
4799 | |
4800 | /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with |
4801 | /// operands N0 and N1. This is a helper for PerformADDCombine that is |
4802 | /// called with the default operands, and if that fails, with commuted |
4803 | /// operands. |
4804 | static SDValue |
4805 | PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, |
4806 | TargetLowering::DAGCombinerInfo &DCI) { |
4807 | EVT VT = N0.getValueType(); |
4808 | |
// Since integer multiply-add costs the same as integer multiply
// but is more costly than integer add, do the fusion only when
// the mul is used solely by the add.
// TODO: this may not be true for later architectures, consider relaxing this
4813 | if (!N0.getNode()->hasOneUse()) |
4814 | return SDValue(); |
4815 | |
4816 | // fold (add (select cond, 0, (mul a, b)), c) |
4817 | // -> (select cond, c, (add (mul a, b), c)) |
4818 | // |
4819 | if (N0.getOpcode() == ISD::SELECT) { |
4820 | unsigned ZeroOpNum; |
4821 | if (isConstZero(Operand: N0->getOperand(Num: 1))) |
4822 | ZeroOpNum = 1; |
4823 | else if (isConstZero(Operand: N0->getOperand(Num: 2))) |
4824 | ZeroOpNum = 2; |
4825 | else |
4826 | return SDValue(); |
4827 | |
4828 | SDValue M = N0->getOperand(Num: (ZeroOpNum == 1) ? 2 : 1); |
4829 | if (M->getOpcode() != ISD::MUL || !M.getNode()->hasOneUse()) |
4830 | return SDValue(); |
4831 | |
4832 | SDLoc DL(N); |
4833 | SDValue Mul = |
4834 | DCI.DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: M->getOperand(Num: 0), N2: M->getOperand(Num: 1)); |
4835 | SDValue MAD = DCI.DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Mul, N2: N1); |
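// Rebuild the select so that the arm that held the zero now yields just N1
// (0 + c == c), while the other arm yields the newly built mul+add.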
4836 | return DCI.DAG.getSelect(DL: SDLoc(N), VT, Cond: N0->getOperand(Num: 0), |
4837 | LHS: ((ZeroOpNum == 1) ? N1 : MAD), |
4838 | RHS: ((ZeroOpNum == 1) ? MAD : N1)); |
4839 | } |
4840 | |
4841 | return SDValue(); |
4842 | } |
4843 | |
4844 | static SDValue |
4845 | PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, |
4846 | TargetLowering::DAGCombinerInfo &DCI, |
4847 | CodeGenOptLevel OptLevel) { |
4848 | EVT VT = N0.getValueType(); |
4849 | if (N0.getOpcode() == ISD::FMUL) { |
4850 | const auto *TLI = static_cast<const NVPTXTargetLowering *>( |
4851 | &DCI.DAG.getTargetLoweringInfo()); |
4852 | if (!(TLI->allowFMA(MF&: DCI.DAG.getMachineFunction(), OptLevel) || |
4853 | (N->getFlags().hasAllowContract() && |
4854 | N0->getFlags().hasAllowContract()))) |
4855 | return SDValue(); |
4856 | |
// For floating point:
// Do the fusion only when the mul has less than 5 uses and all
// are add.
// The heuristic is that if a use is not an add, then that use
// cannot be fused into an fma, therefore the mul is still needed anyway.
// If there are more than 4 uses, even if they are all adds, fusing
// them will increase register pressure.
//
4865 | int numUses = 0; |
4866 | int nonAddCount = 0; |
4867 | for (const SDNode *User : N0.getNode()->users()) { |
4868 | numUses++; |
4869 | if (User->getOpcode() != ISD::FADD) |
4870 | ++nonAddCount; |
4871 | if (numUses >= 5) |
4872 | return SDValue(); |
4873 | } |
4874 | if (nonAddCount) { |
4875 | int orderNo = N->getIROrder(); |
4876 | int orderNo2 = N0.getNode()->getIROrder(); |
// A simple heuristic for estimating potential register pressure: the
// difference in IR order approximates the distance between the def and
// the use, and the longer that distance, the more likely the fusion is
// to increase register pressure.
4881 | if (orderNo - orderNo2 < 500) |
4882 | return SDValue(); |
4883 | |
4884 | // Now, check if at least one of the FMUL's operands is live beyond the |
4885 | // node N, which guarantees that the FMA will not increase register |
4886 | // pressure at node N. |
4887 | bool opIsLive = false; |
4888 | const SDNode *left = N0.getOperand(i: 0).getNode(); |
4889 | const SDNode *right = N0.getOperand(i: 1).getNode(); |
4890 | |
4891 | if (isa<ConstantSDNode>(Val: left) || isa<ConstantSDNode>(Val: right)) |
4892 | opIsLive = true; |
4893 | |
4894 | if (!opIsLive) |
4895 | for (const SDNode *User : left->users()) { |
4896 | int orderNo3 = User->getIROrder(); |
4897 | if (orderNo3 > orderNo) { |
4898 | opIsLive = true; |
4899 | break; |
4900 | } |
4901 | } |
4902 | |
4903 | if (!opIsLive) |
4904 | for (const SDNode *User : right->users()) { |
4905 | int orderNo3 = User->getIROrder(); |
4906 | if (orderNo3 > orderNo) { |
4907 | opIsLive = true; |
4908 | break; |
4909 | } |
4910 | } |
4911 | |
4912 | if (!opIsLive) |
4913 | return SDValue(); |
4914 | } |
4915 | |
4916 | return DCI.DAG.getNode(Opcode: ISD::FMA, DL: SDLoc(N), VT, N1: N0.getOperand(i: 0), |
4917 | N2: N0.getOperand(i: 1), N3: N1); |
4918 | } |
4919 | |
4920 | return SDValue(); |
4921 | } |
4922 | |
4923 | /// Fold extractelts into a load by increasing the number of return values. |
4924 | /// |
4925 | /// ex: |
4926 | /// L: v2f16,ch = load <p> |
4927 | /// a: f16 = extractelt L:0, 0 |
4928 | /// b: f16 = extractelt L:0, 1 |
4929 | /// use(a, b) |
4930 | /// |
4931 | /// ...is turned into... |
4932 | /// L: f16,f16,ch = LoadV2 <p> |
4933 | /// use(L:0, L:1) |
4934 | static SDValue |
4935 | combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { |
4936 | // Don't run this optimization before the legalizer |
4937 | if (!DCI.isAfterLegalizeDAG()) |
4938 | return SDValue(); |
4939 | |
4940 | EVT ElemVT = N->getValueType(ResNo: 0); |
4941 | if (!Isv2x16VT(VT: ElemVT)) |
4942 | return SDValue(); |
4943 | |
4944 | // Check whether all outputs are either used by an extractelt or are |
4945 | // glue/chain nodes |
4946 | if (!all_of(Range: N->uses(), P: [&](SDUse &U) { |
4947 | // Skip glue, chain nodes |
4948 | if (U.getValueType() == MVT::Glue || U.getValueType() == MVT::Other) |
4949 | return true; |
4950 | if (U.getUser()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) { |
4951 | if (N->getOpcode() != ISD::LOAD) |
4952 | return true; |
4953 | // Since this is an ISD::LOAD, check all extractelts are used. If |
4954 | // any are not used, we don't want to defeat another optimization that |
4955 | // will narrow the load. |
4956 | // |
4957 | // For example: |
4958 | // |
4959 | // L: v2f16,ch = load <p> |
4960 | // e0: f16 = extractelt L:0, 0 |
4961 | // e1: f16 = extractelt L:0, 1 <-- unused |
4962 | // store e0 |
4963 | // |
4964 | // Can be optimized by DAGCombiner to: |
4965 | // |
4966 | // L: f16,ch = load <p> |
4967 | // store L:0 |
4968 | return !U.getUser()->use_empty(); |
4969 | } |
4970 | |
4971 | // Otherwise, this use prevents us from splitting a value. |
4972 | return false; |
4973 | })) |
4974 | return SDValue(); |
4975 | |
4976 | auto *LD = cast<MemSDNode>(Val: N); |
4977 | EVT MemVT = LD->getMemoryVT(); |
4978 | SDLoc DL(LD); |
4979 | |
  // The new opcode after we double the number of return values.
4981 | NVPTXISD::NodeType Opcode; |
4982 | SmallVector<SDValue> Operands(LD->ops()); |
4983 | unsigned OldNumOutputs; // non-glue, non-chain outputs |
4984 | switch (LD->getOpcode()) { |
4985 | case ISD::LOAD: |
4986 | OldNumOutputs = 1; |
4987 | // Any packed type is legal, so the legalizer will not have lowered |
4988 | // ISD::LOAD -> NVPTXISD::Load (unless it's under-aligned). We have to do it |
4989 | // here. |
4990 | Opcode = NVPTXISD::LoadV2; |
4991 | Operands.push_back(Elt: DCI.DAG.getIntPtrConstant( |
4992 | Val: cast<LoadSDNode>(Val: LD)->getExtensionType(), DL)); |
4993 | break; |
4994 | case NVPTXISD::LoadParamV2: |
4995 | OldNumOutputs = 2; |
4996 | Opcode = NVPTXISD::LoadParamV4; |
4997 | break; |
4998 | case NVPTXISD::LoadV2: |
4999 | OldNumOutputs = 2; |
5000 | Opcode = NVPTXISD::LoadV4; |
5001 | break; |
5002 | case NVPTXISD::LoadV4: |
5003 | case NVPTXISD::LoadV8: |
5004 | // PTX doesn't support the next doubling of outputs |
5005 | return SDValue(); |
5006 | } |
5007 | |
5008 | // the non-glue, non-chain outputs in the new load |
5009 | const unsigned NewNumOutputs = OldNumOutputs * 2; |
5010 | SmallVector<EVT> NewVTs(NewNumOutputs, ElemVT.getVectorElementType()); |
5011 | // add remaining chain and glue values |
5012 | NewVTs.append(in_start: LD->value_begin() + OldNumOutputs, in_end: LD->value_end()); |
5013 | |
5014 | // Create the new load |
5015 | SDValue NewLoad = |
5016 | DCI.DAG.getMemIntrinsicNode(Opcode, dl: DL, VTList: DCI.DAG.getVTList(VTs: NewVTs), |
5017 | Ops: Operands, MemVT, MMO: LD->getMemOperand()); |
5018 | |
5019 | // Now we use a combination of BUILD_VECTORs and a MERGE_VALUES node to keep |
5020 | // the outputs the same. These nodes will be optimized away in later |
5021 | // DAGCombiner iterations. |
5022 | SmallVector<SDValue> Results; |
5023 | for (unsigned I : seq(Size: OldNumOutputs)) |
5024 | Results.push_back(Elt: DCI.DAG.getBuildVector( |
5025 | VT: ElemVT, DL, Ops: {NewLoad.getValue(R: I * 2), NewLoad.getValue(R: I * 2 + 1)})); |
5026 | // Add remaining chain and glue nodes |
5027 | for (unsigned I : seq(Size: NewLoad->getNumValues() - NewNumOutputs)) |
5028 | Results.push_back(Elt: NewLoad.getValue(R: NewNumOutputs + I)); |
5029 | |
5030 | return DCI.DAG.getMergeValues(Ops: Results, dl: DL); |
5031 | } |
5032 | |
5033 | /// Fold a packing mov into a store. |
5034 | /// |
5035 | /// ex: |
5036 | /// v: v2f16 = BUILD_VECTOR a:f16, b:f16 |
5037 | /// StoreRetval v |
5038 | /// |
5039 | /// ...is turned into... |
5040 | /// |
5041 | /// StoreRetvalV2 a:f16, b:f16 |
5042 | static SDValue combinePackingMovIntoStore(SDNode *N, |
5043 | TargetLowering::DAGCombinerInfo &DCI, |
5044 | unsigned Front, unsigned Back) { |
5045 | // We want to run this as late as possible since other optimizations may |
5046 | // eliminate the BUILD_VECTORs. |
5047 | if (!DCI.isAfterLegalizeDAG()) |
5048 | return SDValue(); |
5049 | |
5050 | // Get the type of the operands being stored. |
5051 | EVT ElementVT = N->getOperand(Num: Front).getValueType(); |
5052 | |
5053 | if (!Isv2x16VT(VT: ElementVT)) |
5054 | return SDValue(); |
5055 | |
5056 | auto *ST = cast<MemSDNode>(Val: N); |
5057 | EVT MemVT = ElementVT.getVectorElementType(); |
5058 | |
5059 | // The new opcode after we double the number of operands. |
5060 | NVPTXISD::NodeType Opcode; |
5061 | switch (N->getOpcode()) { |
5062 | case ISD::STORE: |
5063 | // Any packed type is legal, so the legalizer will not have lowered |
5064 | // ISD::STORE -> NVPTXISD::Store (unless it's under-aligned). We have to do |
5065 | // it here. |
5066 | MemVT = ST->getMemoryVT(); |
5067 | Opcode = NVPTXISD::StoreV2; |
5068 | break; |
5069 | case NVPTXISD::StoreParam: |
5070 | Opcode = NVPTXISD::StoreParamV2; |
5071 | break; |
5072 | case NVPTXISD::StoreParamV2: |
5073 | Opcode = NVPTXISD::StoreParamV4; |
5074 | break; |
5075 | case NVPTXISD::StoreV2: |
5076 | MemVT = ST->getMemoryVT(); |
5077 | Opcode = NVPTXISD::StoreV4; |
5078 | break; |
5079 | case NVPTXISD::StoreV4: |
5080 | case NVPTXISD::StoreParamV4: |
5081 | case NVPTXISD::StoreV8: |
5082 | // PTX doesn't support the next doubling of operands |
5083 | return SDValue(); |
5084 | default: |
5085 | llvm_unreachable("Unhandled store opcode" ); |
5086 | } |
5087 | |
5088 | // Scan the operands and if they're all BUILD_VECTORs, we'll have gathered |
5089 | // their elements. |
5090 | SmallVector<SDValue, 4> Operands(N->ops().take_front(N: Front)); |
5091 | for (SDValue BV : N->ops().drop_front(N: Front).drop_back(N: Back)) { |
5092 | if (BV.getOpcode() != ISD::BUILD_VECTOR) |
5093 | return SDValue(); |
5094 | |
5095 | // If the operand has multiple uses, this optimization can increase register |
5096 | // pressure. |
5097 | if (!BV.hasOneUse()) |
5098 | return SDValue(); |
5099 | |
5100 | // DAGCombiner visits nodes bottom-up. Check the BUILD_VECTOR operands for |
5101 | // any signs they may be folded by some other pattern or rule. |
5102 | for (SDValue Op : BV->ops()) { |
5103 | // Peek through bitcasts |
5104 | if (Op.getOpcode() == ISD::BITCAST) |
5105 | Op = Op.getOperand(i: 0); |
5106 | |
5107 | // This may be folded into a PRMT. |
5108 | if (Op.getValueType() == MVT::i16 && Op.getOpcode() == ISD::TRUNCATE && |
5109 | Op->getOperand(Num: 0).getValueType() == MVT::i32) |
5110 | return SDValue(); |
5111 | |
5112 | // This may be folded into cvt.bf16x2 |
5113 | if (Op.getOpcode() == ISD::FP_ROUND) |
5114 | return SDValue(); |
5115 | } |
5116 | Operands.append(IL: {BV.getOperand(i: 0), BV.getOperand(i: 1)}); |
5117 | } |
5118 | Operands.append(in_start: N->op_end() - Back, in_end: N->op_end()); |
5119 | |
5120 | // Now we replace the store |
5121 | return DCI.DAG.getMemIntrinsicNode(Opcode, dl: SDLoc(N), VTList: N->getVTList(), Ops: Operands, |
5122 | MemVT, MMO: ST->getMemOperand()); |
5123 | } |
5124 | |
5125 | static SDValue PerformStoreCombineHelper(SDNode *N, |
5126 | TargetLowering::DAGCombinerInfo &DCI, |
5127 | unsigned Front, unsigned Back) { |
5128 | if (all_of(Range: N->ops().drop_front(N: Front).drop_back(N: Back), |
5129 | P: [](const SDUse &U) { return U.get()->isUndef(); })) |
5130 | // Operand 0 is the previous value in the chain. Cannot return EntryToken |
5131 | // as the previous value will become unused and eliminated later. |
5132 | return N->getOperand(Num: 0); |
5133 | |
5134 | return combinePackingMovIntoStore(N, DCI, Front, Back); |
5135 | } |
5136 | |
5137 | static SDValue PerformStoreCombine(SDNode *N, |
5138 | TargetLowering::DAGCombinerInfo &DCI) { |
5139 | return combinePackingMovIntoStore(N, DCI, Front: 1, Back: 2); |
5140 | } |
5141 | |
5142 | static SDValue PerformStoreParamCombine(SDNode *N, |
5143 | TargetLowering::DAGCombinerInfo &DCI) { |
  // Operands from index 3 up to (but not including) the last one are the
  // values to be stored: {Chain, ArgID, Offset, Val, Glue}.
5146 | return PerformStoreCombineHelper(N, DCI, Front: 3, Back: 1); |
5147 | } |
5148 | |
5149 | /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. |
5150 | /// |
5151 | static SDValue PerformADDCombine(SDNode *N, |
5152 | TargetLowering::DAGCombinerInfo &DCI, |
5153 | CodeGenOptLevel OptLevel) { |
5154 | if (OptLevel == CodeGenOptLevel::None) |
5155 | return SDValue(); |
5156 | |
5157 | SDValue N0 = N->getOperand(Num: 0); |
5158 | SDValue N1 = N->getOperand(Num: 1); |
5159 | |
5160 | // Skip non-integer, non-scalar case |
5161 | EVT VT = N0.getValueType(); |
5162 | if (VT.isVector() || VT != MVT::i32) |
5163 | return SDValue(); |
5164 | |
5165 | // First try with the default operand order. |
5166 | if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI)) |
5167 | return Result; |
5168 | |
5169 | // If that didn't work, try again with the operands commuted. |
5170 | return PerformADDCombineWithOperands(N, N0: N1, N1: N0, DCI); |
5171 | } |
5172 | |
5173 | /// PerformFADDCombine - Target-specific dag combine xforms for ISD::FADD. |
5174 | /// |
5175 | static SDValue PerformFADDCombine(SDNode *N, |
5176 | TargetLowering::DAGCombinerInfo &DCI, |
5177 | CodeGenOptLevel OptLevel) { |
5178 | SDValue N0 = N->getOperand(Num: 0); |
5179 | SDValue N1 = N->getOperand(Num: 1); |
5180 | |
5181 | EVT VT = N0.getValueType(); |
5182 | if (VT.isVector() || !(VT == MVT::f32 || VT == MVT::f64)) |
5183 | return SDValue(); |
5184 | |
5185 | // First try with the default operand order. |
5186 | if (SDValue Result = PerformFADDCombineWithOperands(N, N0, N1, DCI, OptLevel)) |
5187 | return Result; |
5188 | |
5189 | // If that didn't work, try again with the operands commuted. |
5190 | return PerformFADDCombineWithOperands(N, N0: N1, N1: N0, DCI, OptLevel); |
5191 | } |
5192 | |
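/// Drop AND masks that only re-clear bits which are already zero, e.g. the
/// `and x, 255` the type legalizer leaves behind after widening i8 vector
/// loads (see the comment inside the function).
///
/// A rough sketch (value names are illustrative):
///    L: i16,i16,ch = LoadV2 <p>      ; zextload of v2i8
///    a: i32 = any_extend L:0
///    r: i32 = and a, 255
/// ...the AND is redundant, so 'r' becomes (zero_extend L:0).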
5193 | static SDValue PerformANDCombine(SDNode *N, |
5194 | TargetLowering::DAGCombinerInfo &DCI) { |
5195 | // The type legalizer turns a vector load of i8 values into a zextload to i16 |
5196 | // registers, optionally ANY_EXTENDs it (if target type is integer), |
5197 | // and ANDs off the high 8 bits. Since we turn this load into a |
5198 | // target-specific DAG node, the DAG combiner fails to eliminate these AND |
5199 | // nodes. Do that here. |
5200 | SDValue Val = N->getOperand(Num: 0); |
5201 | SDValue Mask = N->getOperand(Num: 1); |
5202 | |
5203 | if (isa<ConstantSDNode>(Val)) { |
5204 | std::swap(a&: Val, b&: Mask); |
5205 | } |
5206 | |
5207 | SDValue AExt; |
5208 | |
5209 | // Convert BFE-> truncate i16 -> and 255 |
5210 | // To just BFE-> truncate i16, as the value already has all the bits in the |
5211 | // right places. |
5212 | if (Val.getOpcode() == ISD::TRUNCATE) { |
5213 | SDValue BFE = Val.getOperand(i: 0); |
5214 | if (BFE.getOpcode() != NVPTXISD::BFE) |
5215 | return SDValue(); |
5216 | |
5217 | ConstantSDNode *BFEBits = dyn_cast<ConstantSDNode>(Val: BFE.getOperand(i: 0)); |
5218 | if (!BFEBits) |
5219 | return SDValue(); |
5220 | uint64_t BFEBitsVal = BFEBits->getZExtValue(); |
5221 | |
5222 | ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Val&: Mask); |
5223 | if (!MaskCnst) { |
5224 | // Not an AND with a constant |
5225 | return SDValue(); |
5226 | } |
5227 | uint64_t MaskVal = MaskCnst->getZExtValue(); |
5228 | |
5229 | if (MaskVal != (uint64_t(1) << BFEBitsVal) - 1) |
5230 | return SDValue(); |
5231 | // If we get here, the AND is unnecessary. Just replace it with the trunc |
5232 | DCI.CombineTo(N, Res: Val, AddTo: false); |
5233 | } |
5234 | // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and |
5235 | if (Val.getOpcode() == ISD::ANY_EXTEND) { |
5236 | AExt = Val; |
5237 | Val = Val->getOperand(Num: 0); |
5238 | } |
5239 | |
5240 | if (Val->getOpcode() == NVPTXISD::LoadV2 || |
5241 | Val->getOpcode() == NVPTXISD::LoadV4) { |
5242 | ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Val&: Mask); |
5243 | if (!MaskCnst) { |
5244 | // Not an AND with a constant |
5245 | return SDValue(); |
5246 | } |
5247 | |
5248 | uint64_t MaskVal = MaskCnst->getZExtValue(); |
5249 | if (MaskVal != 0xff) { |
5250 | // Not an AND that chops off top 8 bits |
5251 | return SDValue(); |
5252 | } |
5253 | |
5254 | MemSDNode *Mem = dyn_cast<MemSDNode>(Val); |
5255 | if (!Mem) { |
5256 | // Not a MemSDNode?!? |
5257 | return SDValue(); |
5258 | } |
5259 | |
5260 | EVT MemVT = Mem->getMemoryVT(); |
5261 | if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) { |
5262 | // We only handle the i8 case |
5263 | return SDValue(); |
5264 | } |
5265 | |
5266 | unsigned ExtType = Val->getConstantOperandVal(Num: Val->getNumOperands() - 1); |
5267 | if (ExtType == ISD::SEXTLOAD) { |
5268 | // If for some reason the load is a sextload, the and is needed to zero |
5269 | // out the high 8 bits |
5270 | return SDValue(); |
5271 | } |
5272 | |
5273 | bool AddTo = false; |
5274 | if (AExt.getNode() != nullptr) { |
5275 | // Re-insert the ext as a zext. |
5276 | Val = DCI.DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SDLoc(N), |
5277 | VT: AExt.getValueType(), Operand: Val); |
5278 | AddTo = true; |
5279 | } |
5280 | |
5281 | // If we get here, the AND is unnecessary. Just replace it with the load |
5282 | DCI.CombineTo(N, Res: Val, AddTo); |
5283 | } |
5284 | |
5285 | return SDValue(); |
5286 | } |
5287 | |
5288 | static SDValue PerformREMCombine(SDNode *N, |
5289 | TargetLowering::DAGCombinerInfo &DCI, |
5290 | CodeGenOptLevel OptLevel) { |
5291 | assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM); |
5292 | |
5293 | // Don't do anything at less than -O2. |
5294 | if (OptLevel < CodeGenOptLevel::Default) |
5295 | return SDValue(); |
5296 | |
5297 | SelectionDAG &DAG = DCI.DAG; |
5298 | SDLoc DL(N); |
5299 | EVT VT = N->getValueType(ResNo: 0); |
5300 | bool IsSigned = N->getOpcode() == ISD::SREM; |
5301 | unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV; |
5302 | |
5303 | const SDValue &Num = N->getOperand(Num: 0); |
5304 | const SDValue &Den = N->getOperand(Num: 1); |
5305 | |
5306 | for (const SDNode *U : Num->users()) { |
5307 | if (U->getOpcode() == DivOpc && U->getOperand(Num: 0) == Num && |
5308 | U->getOperand(Num: 1) == Den) { |
5309 | // Num % Den -> Num - (Num / Den) * Den |
5310 | return DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Num, |
5311 | N2: DAG.getNode(Opcode: ISD::MUL, DL, VT, |
5312 | N1: DAG.getNode(Opcode: DivOpc, DL, VT, N1: Num, N2: Den), |
5313 | N2: Den)); |
5314 | } |
5315 | } |
5316 | return SDValue(); |
5317 | } |
5318 | |
5319 | enum OperandSignedness { |
5320 | Signed = 0, |
5321 | Unsigned, |
5322 | Unknown |
5323 | }; |
5324 | |
5325 | /// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand |
5326 | /// that can be demoted to \p OptSize bits without loss of information. The |
5327 | /// signedness of the operand, if determinable, is placed in \p S. |
5328 | static bool IsMulWideOperandDemotable(SDValue Op, |
5329 | unsigned OptSize, |
5330 | OperandSignedness &S) { |
5331 | S = Unknown; |
5332 | |
5333 | if (Op.getOpcode() == ISD::SIGN_EXTEND || |
5334 | Op.getOpcode() == ISD::SIGN_EXTEND_INREG) { |
5335 | EVT OrigVT = Op.getOperand(i: 0).getValueType(); |
5336 | if (OrigVT.getFixedSizeInBits() <= OptSize) { |
5337 | S = Signed; |
5338 | return true; |
5339 | } |
5340 | } else if (Op.getOpcode() == ISD::ZERO_EXTEND) { |
5341 | EVT OrigVT = Op.getOperand(i: 0).getValueType(); |
5342 | if (OrigVT.getFixedSizeInBits() <= OptSize) { |
5343 | S = Unsigned; |
5344 | return true; |
5345 | } |
5346 | } |
5347 | |
5348 | return false; |
5349 | } |
5350 | |
5351 | /// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can |
5352 | /// be demoted to \p OptSize bits without loss of information. If the operands |
5353 | /// contain a constant, it should appear as the RHS operand. The signedness of |
5354 | /// the operands is placed in \p IsSigned. |
5355 | static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, |
5356 | unsigned OptSize, |
5357 | bool &IsSigned) { |
5358 | OperandSignedness LHSSign; |
5359 | |
5360 | // The LHS operand must be a demotable op |
5361 | if (!IsMulWideOperandDemotable(Op: LHS, OptSize, S&: LHSSign)) |
5362 | return false; |
5363 | |
5364 | // We should have been able to determine the signedness from the LHS |
5365 | if (LHSSign == Unknown) |
5366 | return false; |
5367 | |
5368 | IsSigned = (LHSSign == Signed); |
5369 | |
5370 | // The RHS can be a demotable op or a constant |
5371 | if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Val&: RHS)) { |
5372 | const APInt &Val = CI->getAPIntValue(); |
5373 | if (LHSSign == Unsigned) { |
5374 | return Val.isIntN(N: OptSize); |
5375 | } else { |
5376 | return Val.isSignedIntN(N: OptSize); |
5377 | } |
5378 | } else { |
5379 | OperandSignedness RHSSign; |
5380 | if (!IsMulWideOperandDemotable(Op: RHS, OptSize, S&: RHSSign)) |
5381 | return false; |
5382 | |
5383 | return LHSSign == RHSSign; |
5384 | } |
5385 | } |
5386 | |
5387 | /// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply |
5388 | /// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform |
5389 | /// works on both multiply DAG nodes and SHL DAG nodes with a constant shift |
5390 | /// amount. |
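///
/// A rough sketch (value names are illustrative; the temporary truncates
/// inserted for type consistency are omitted):
///    r: i32 = mul (sext i16 a), (sext i16 b)
/// ...is turned into...
///    r: i32 = MUL_WIDE_SIGNED a, b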
5391 | static SDValue TryMULWIDECombine(SDNode *N, |
5392 | TargetLowering::DAGCombinerInfo &DCI) { |
5393 | EVT MulType = N->getValueType(ResNo: 0); |
5394 | if (MulType != MVT::i32 && MulType != MVT::i64) { |
5395 | return SDValue(); |
5396 | } |
5397 | |
5398 | SDLoc DL(N); |
5399 | unsigned OptSize = MulType.getSizeInBits() >> 1; |
5400 | SDValue LHS = N->getOperand(Num: 0); |
5401 | SDValue RHS = N->getOperand(Num: 1); |
5402 | |
5403 | // Canonicalize the multiply so the constant (if any) is on the right |
5404 | if (N->getOpcode() == ISD::MUL) { |
5405 | if (isa<ConstantSDNode>(Val: LHS)) { |
5406 | std::swap(a&: LHS, b&: RHS); |
5407 | } |
5408 | } |
5409 | |
5410 | // If we have a SHL, determine the actual multiply amount |
5411 | if (N->getOpcode() == ISD::SHL) { |
5412 | ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(Val&: RHS); |
5413 | if (!ShlRHS) { |
5414 | return SDValue(); |
5415 | } |
5416 | |
5417 | APInt ShiftAmt = ShlRHS->getAPIntValue(); |
5418 | unsigned BitWidth = MulType.getSizeInBits(); |
5419 | if (ShiftAmt.sge(RHS: 0) && ShiftAmt.slt(RHS: BitWidth)) { |
5420 | APInt MulVal = APInt(BitWidth, 1) << ShiftAmt; |
5421 | RHS = DCI.DAG.getConstant(Val: MulVal, DL, VT: MulType); |
5422 | } else { |
5423 | return SDValue(); |
5424 | } |
5425 | } |
5426 | |
5427 | bool Signed; |
5428 | // Verify that our operands are demotable |
5429 | if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, IsSigned&: Signed)) { |
5430 | return SDValue(); |
5431 | } |
5432 | |
5433 | EVT DemotedVT; |
5434 | if (MulType == MVT::i32) { |
5435 | DemotedVT = MVT::i16; |
5436 | } else { |
5437 | DemotedVT = MVT::i32; |
5438 | } |
5439 | |
5440 | // Truncate the operands to the correct size. Note that these are just for |
5441 | // type consistency and will (likely) be eliminated in later phases. |
5442 | SDValue TruncLHS = |
5443 | DCI.DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: DemotedVT, Operand: LHS); |
5444 | SDValue TruncRHS = |
5445 | DCI.DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: DemotedVT, Operand: RHS); |
5446 | |
5447 | unsigned Opc; |
5448 | if (Signed) { |
5449 | Opc = NVPTXISD::MUL_WIDE_SIGNED; |
5450 | } else { |
5451 | Opc = NVPTXISD::MUL_WIDE_UNSIGNED; |
5452 | } |
5453 | |
5454 | return DCI.DAG.getNode(Opcode: Opc, DL, VT: MulType, N1: TruncLHS, N2: TruncRHS); |
5455 | } |
5456 | |
5457 | static bool isConstOne(const SDValue &Operand) { |
5458 | const auto *Const = dyn_cast<ConstantSDNode>(Val: Operand); |
5459 | return Const && Const->getZExtValue() == 1; |
5460 | } |
5461 | |
5462 | static SDValue matchMADConstOnePattern(SDValue Add) { |
5463 | if (Add->getOpcode() != ISD::ADD) |
5464 | return SDValue(); |
5465 | |
5466 | if (isConstOne(Operand: Add->getOperand(Num: 0))) |
5467 | return Add->getOperand(Num: 1); |
5468 | |
5469 | if (isConstOne(Operand: Add->getOperand(Num: 1))) |
5470 | return Add->getOperand(Num: 0); |
5471 | |
5472 | return SDValue(); |
5473 | } |
5474 | |
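/// Rewrite (mul x, (add y, 1)) as (add (mul x, y), x), which can typically be
/// selected as a single mad instruction.
///
/// A rough sketch (value names are illustrative):
///    r = mul x, (add y, 1)
/// ...is turned into...
///    r = add (mul x, y), x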
5475 | static SDValue combineMADConstOne(SDValue X, SDValue Add, EVT VT, SDLoc DL, |
5476 | TargetLowering::DAGCombinerInfo &DCI) { |
5477 | |
5478 | if (SDValue Y = matchMADConstOnePattern(Add)) { |
5479 | SDValue Mul = DCI.DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: X, N2: Y); |
5480 | return DCI.DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Mul, N2: X); |
5481 | } |
5482 | |
5483 | return SDValue(); |
5484 | } |
5485 | |
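/// Push a multiply below a select that has the constant 1 in one arm, so the
/// multiply by 1 disappears on that arm.
///
/// A rough sketch (value names are illustrative):
///    r = mul x, (select c, 1, y)
/// ...is turned into...
///    r = select c, x, (mul x, y)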
5486 | static SDValue combineMulSelectConstOne(SDValue X, SDValue Select, EVT VT, |
5487 | SDLoc DL, |
5488 | TargetLowering::DAGCombinerInfo &DCI) { |
5489 | if (Select->getOpcode() != ISD::SELECT) |
5490 | return SDValue(); |
5491 | |
5492 | SDValue Cond = Select->getOperand(Num: 0); |
5493 | |
5494 | unsigned ConstOpNo; |
5495 | if (isConstOne(Operand: Select->getOperand(Num: 1))) |
5496 | ConstOpNo = 1; |
5497 | else if (isConstOne(Operand: Select->getOperand(Num: 2))) |
5498 | ConstOpNo = 2; |
5499 | else |
5500 | return SDValue(); |
5501 | |
5502 | SDValue Y = Select->getOperand(Num: (ConstOpNo == 1) ? 2 : 1); |
5503 | |
5504 | // Do not combine if the resulting sequence is not obviously profitable. |
5505 | if (!matchMADConstOnePattern(Add: Y)) |
5506 | return SDValue(); |
5507 | |
5508 | SDValue NewMul = DCI.DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: X, N2: Y); |
5509 | |
5510 | return DCI.DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: Cond, |
5511 | N2: (ConstOpNo == 1) ? X : NewMul, |
5512 | N3: (ConstOpNo == 1) ? NewMul : X); |
5513 | } |
5514 | |
5515 | static SDValue |
5516 | PerformMULCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, |
5517 | TargetLowering::DAGCombinerInfo &DCI) { |
5518 | |
5519 | EVT VT = N0.getValueType(); |
5520 | if (VT.isVector()) |
5521 | return SDValue(); |
5522 | |
5523 | if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64) |
5524 | return SDValue(); |
5525 | |
5526 | SDLoc DL(N); |
5527 | |
5528 | // (mul x, (add y, 1)) -> (add (mul x, y), x) |
5529 | if (SDValue Res = combineMADConstOne(X: N0, Add: N1, VT, DL, DCI)) |
5530 | return Res; |
5531 | if (SDValue Res = combineMADConstOne(X: N1, Add: N0, VT, DL, DCI)) |
5532 | return Res; |
5533 | |
  // (mul x, (select c, 1, y)) -> (select c, x, (mul x, y))
5535 | if (SDValue Res = combineMulSelectConstOne(X: N0, Select: N1, VT, DL, DCI)) |
5536 | return Res; |
5537 | if (SDValue Res = combineMulSelectConstOne(X: N1, Select: N0, VT, DL, DCI)) |
5538 | return Res; |
5539 | |
5540 | return SDValue(); |
5541 | } |
5542 | |
5543 | /// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes. |
5544 | static SDValue PerformMULCombine(SDNode *N, |
5545 | TargetLowering::DAGCombinerInfo &DCI, |
5546 | CodeGenOptLevel OptLevel) { |
5547 | if (OptLevel == CodeGenOptLevel::None) |
5548 | return SDValue(); |
5549 | |
5550 | if (SDValue Ret = TryMULWIDECombine(N, DCI)) |
5551 | return Ret; |
5552 | |
5553 | SDValue N0 = N->getOperand(Num: 0); |
5554 | SDValue N1 = N->getOperand(Num: 1); |
5555 | return PerformMULCombineWithOperands(N, N0, N1, DCI); |
5556 | } |
5557 | |
5558 | /// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes. |
5559 | static SDValue PerformSHLCombine(SDNode *N, |
5560 | TargetLowering::DAGCombinerInfo &DCI, |
5561 | CodeGenOptLevel OptLevel) { |
5562 | if (OptLevel > CodeGenOptLevel::None) { |
5563 | // Try mul.wide combining at OptLevel > 0 |
5564 | if (SDValue Ret = TryMULWIDECombine(N, DCI)) |
5565 | return Ret; |
5566 | } |
5567 | |
5568 | return SDValue(); |
5569 | } |
5570 | |
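/// Combine a v2i1 setcc of packed f16/bf16 operands into a single
/// SETP_F16X2/SETP_BF16X2 node.
///
/// A rough sketch (value names are illustrative):
///    c: v2i1 = setcc a:v2f16, b:v2f16, cc
/// ...is turned into...
///    p: i1,i1 = SETP_F16X2 a, b, cc
///    c: v2i1  = BUILD_VECTOR p:0, p:1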
5571 | static SDValue PerformSETCCCombine(SDNode *N, |
5572 | TargetLowering::DAGCombinerInfo &DCI, |
5573 | unsigned int SmVersion) { |
5574 | EVT CCType = N->getValueType(ResNo: 0); |
5575 | SDValue A = N->getOperand(Num: 0); |
5576 | SDValue B = N->getOperand(Num: 1); |
5577 | |
5578 | EVT AType = A.getValueType(); |
5579 | if (!(CCType == MVT::v2i1 && (AType == MVT::v2f16 || AType == MVT::v2bf16))) |
5580 | return SDValue(); |
5581 | |
5582 | if (A.getValueType() == MVT::v2bf16 && SmVersion < 90) |
5583 | return SDValue(); |
5584 | |
5585 | SDLoc DL(N); |
5586 | // setp.f16x2 returns two scalar predicates, which we need to |
5587 | // convert back to v2i1. The returned result will be scalarized by |
5588 | // the legalizer, but the comparison will remain a single vector |
5589 | // instruction. |
5590 | SDValue CCNode = DCI.DAG.getNode( |
5591 | Opcode: A.getValueType() == MVT::v2f16 ? NVPTXISD::SETP_F16X2 |
5592 | : NVPTXISD::SETP_BF16X2, |
5593 | DL, VTList: DCI.DAG.getVTList(VT1: MVT::i1, VT2: MVT::i1), Ops: {A, B, N->getOperand(Num: 2)}); |
5594 | return DCI.DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL, VT: CCType, N1: CCNode.getValue(R: 0), |
5595 | N2: CCNode.getValue(R: 1)); |
5596 | } |
5597 | |
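/// Lower extract_vector_elt on small vectors that fit in a register into a
/// bitcast, shift, and truncate of the underlying integer value.
///
/// A rough sketch (value names are illustrative):
///    e: f32 = extractelt v:v2f32, 1
/// ...is turned into...
///    i: i64 = bitcast v
///    e: f32 = bitcast (trunc (sra i, 32) to i32)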
static SDValue PerformEXTRACTCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI) {
5600 | SDValue Vector = N->getOperand(Num: 0); |
5601 | if (Vector->getOpcode() == ISD::FREEZE) |
5602 | Vector = Vector->getOperand(Num: 0); |
5603 | SDLoc DL(N); |
5604 | EVT VectorVT = Vector.getValueType(); |
5605 | if (Vector->getOpcode() == ISD::LOAD && VectorVT.isSimple() && |
5606 | IsPTXVectorType(VT: VectorVT.getSimpleVT())) |
5607 | return SDValue(); // Native vector loads already combine nicely w/ |
5608 | // extract_vector_elt. |
  // Don't mess with singletons or v2*16, v4i8, and v8i8 types; we already
  // handle them OK.
5611 | if (VectorVT.getVectorNumElements() == 1 || Isv2x16VT(VT: VectorVT) || |
5612 | VectorVT == MVT::v4i8 || VectorVT == MVT::v8i8) |
5613 | return SDValue(); |
5614 | |
5615 | // Don't mess with undef values as sra may be simplified to 0, not undef. |
5616 | if (Vector->isUndef() || ISD::allOperandsUndef(N: Vector.getNode())) |
5617 | return SDValue(); |
5618 | |
5619 | uint64_t VectorBits = VectorVT.getSizeInBits(); |
5620 | // We only handle the types we can extract in-register. |
5621 | if (!(VectorBits == 16 || VectorBits == 32 || VectorBits == 64)) |
5622 | return SDValue(); |
5623 | |
5624 | ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1)); |
5625 | // Index == 0 is handled by generic DAG combiner. |
5626 | if (!Index || Index->getZExtValue() == 0) |
5627 | return SDValue(); |
5628 | |
5629 | MVT IVT = MVT::getIntegerVT(BitWidth: VectorBits); |
5630 | EVT EltVT = VectorVT.getVectorElementType(); |
5631 | EVT EltIVT = EltVT.changeTypeToInteger(); |
5632 | uint64_t EltBits = EltVT.getScalarSizeInBits(); |
5633 | |
5634 | SDValue Result = DCI.DAG.getNode( |
5635 | Opcode: ISD::TRUNCATE, DL, VT: EltIVT, |
5636 | Operand: DCI.DAG.getNode( |
5637 | Opcode: ISD::SRA, DL, VT: IVT, N1: DCI.DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IVT, Operand: Vector), |
5638 | N2: DCI.DAG.getConstant(Val: Index->getZExtValue() * EltBits, DL, VT: IVT))); |
5639 | |
5640 | // If element has non-integer type, bitcast it back to the expected type. |
5641 | if (EltVT != EltIVT) |
5642 | Result = DCI.DAG.getNode(Opcode: ISD::BITCAST, DL, VT: EltVT, Operand: Result); |
  // Past legalizer, we may need to extend i8 -> i16 to match the register type.
5644 | if (EltVT != N->getValueType(ResNo: 0)) |
5645 | Result = DCI.DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: N->getValueType(ResNo: 0), Operand: Result); |
5646 | |
5647 | return Result; |
5648 | } |
5649 | |
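/// Split a v4i8 vselect into four scalar selects computed as i32, then rebuild
/// the vector with a BUILD_VECTOR.
///
/// A rough sketch (value names are illustrative):
///    r: v4i8 = vselect c:v4i1, a:v4i8, b:v4i8
/// ...is turned into, for each lane I...
///    r[I] = select c[I], a[I], b[I]    ; computed as i32, truncated to i8
///    r: v4i8 = BUILD_VECTOR r[0], r[1], r[2], r[3]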
5650 | static SDValue PerformVSELECTCombine(SDNode *N, |
5651 | TargetLowering::DAGCombinerInfo &DCI) { |
5652 | SDValue VA = N->getOperand(Num: 1); |
5653 | EVT VectorVT = VA.getValueType(); |
5654 | if (VectorVT != MVT::v4i8) |
5655 | return SDValue(); |
5656 | |
  // We need to split the vselect into individual per-element operations.
  // Because we use BFE/BFI instructions for byte extraction/insertion, we end
  // up with 32-bit values anyway, so we may as well do the comparison as i32
  // to avoid the conversions to/from i16 normally used for i8 values.
5661 | SmallVector<SDValue, 4> E; |
5662 | SDLoc DL(N); |
5663 | SDValue VCond = N->getOperand(Num: 0); |
5664 | SDValue VB = N->getOperand(Num: 2); |
5665 | for (int I = 0; I < 4; ++I) { |
5666 | SDValue C = DCI.DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i1, N1: VCond, |
5667 | N2: DCI.DAG.getConstant(Val: I, DL, VT: MVT::i32)); |
5668 | SDValue EA = DCI.DAG.getAnyExtOrTrunc( |
5669 | Op: DCI.DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i8, N1: VA, |
5670 | N2: DCI.DAG.getConstant(Val: I, DL, VT: MVT::i32)), |
5671 | DL, VT: MVT::i32); |
5672 | SDValue EB = DCI.DAG.getAnyExtOrTrunc( |
5673 | Op: DCI.DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i8, N1: VB, |
5674 | N2: DCI.DAG.getConstant(Val: I, DL, VT: MVT::i32)), |
5675 | DL, VT: MVT::i32); |
5676 | E.push_back(Elt: DCI.DAG.getAnyExtOrTrunc( |
5677 | Op: DCI.DAG.getNode(Opcode: ISD::SELECT, DL, VT: MVT::i32, N1: C, N2: EA, N3: EB), DL, VT: MVT::i8)); |
5678 | } |
5679 | return DCI.DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL, VT: MVT::v4i8, Ops: E); |
5680 | } |
5681 | |
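/// Fold a v2x16 BUILD_VECTOR of two values truncated from i32 into a single
/// PRMT that picks the wanted bytes directly.
///
/// A rough sketch (value names are illustrative):
///    a: i16 = truncate x:i32
///    b: i16 = truncate y:i32
///    v: v2i16 = BUILD_VECTOR a, b
/// ...is turned into...
///    v: v2i16 = bitcast (PRMT x, y, 0x5410, NONE)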
5682 | static SDValue |
5683 | PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { |
5684 | auto VT = N->getValueType(ResNo: 0); |
5685 | if (!DCI.isAfterLegalizeDAG() || !Isv2x16VT(VT)) |
5686 | return SDValue(); |
5687 | |
5688 | auto Op0 = N->getOperand(Num: 0); |
5689 | auto Op1 = N->getOperand(Num: 1); |
5690 | |
5691 | // Start out by assuming we want to take the lower 2 bytes of each i32 |
5692 | // operand. |
5693 | uint64_t Op0Bytes = 0x10; |
5694 | uint64_t Op1Bytes = 0x54; |
5695 | |
5696 | std::pair<SDValue *, uint64_t *> OpData[2] = {{&Op0, &Op0Bytes}, |
5697 | {&Op1, &Op1Bytes}}; |
5698 | |
5699 | // Check that each operand is an i16, truncated from an i32 operand. We'll |
5700 | // select individual bytes from those original operands. Optionally, fold in a |
5701 | // shift right of that original operand. |
5702 | for (auto &[Op, OpBytes] : OpData) { |
5703 | // Eat up any bitcast |
5704 | if (Op->getOpcode() == ISD::BITCAST) |
5705 | *Op = Op->getOperand(i: 0); |
5706 | |
5707 | if (!(Op->getValueType() == MVT::i16 && Op->getOpcode() == ISD::TRUNCATE && |
5708 | Op->getOperand(i: 0).getValueType() == MVT::i32)) |
5709 | return SDValue(); |
5710 | |
5711 | // If the truncate has multiple uses, this optimization can increase |
5712 | // register pressure |
5713 | if (!Op->hasOneUse()) |
5714 | return SDValue(); |
5715 | |
5716 | *Op = Op->getOperand(i: 0); |
5717 | |
5718 | // Optionally, fold in a shift-right of the original operand and let permute |
5719 | // pick the two higher bytes of the original value directly. |
5720 | if (Op->getOpcode() == ISD::SRL && isa<ConstantSDNode>(Val: Op->getOperand(i: 1))) { |
5721 | if (cast<ConstantSDNode>(Val: Op->getOperand(i: 1))->getZExtValue() == 16) { |
5722 | // Shift the PRMT byte selector to pick upper bytes from each respective |
5723 | // value, instead of the lower ones: 0x10 -> 0x32, 0x54 -> 0x76 |
5724 | assert((*OpBytes == 0x10 || *OpBytes == 0x54) && |
5725 | "PRMT selector values out of range" ); |
5726 | *OpBytes += 0x22; |
5727 | *Op = Op->getOperand(i: 0); |
5728 | } |
5729 | } |
5730 | } |
5731 | |
5732 | SDLoc DL(N); |
5733 | auto &DAG = DCI.DAG; |
5734 | |
5735 | auto PRMT = DAG.getNode( |
5736 | Opcode: NVPTXISD::PRMT, DL, VT: MVT::v4i8, |
5737 | Ops: {Op0, Op1, DAG.getConstant(Val: (Op1Bytes << 8) | Op0Bytes, DL, VT: MVT::i32), |
5738 | DAG.getConstant(Val: NVPTX::PTXPrmtMode::NONE, DL, VT: MVT::i32)}); |
5739 | return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: PRMT); |
5740 | } |
5741 | |
5742 | static SDValue combineADDRSPACECAST(SDNode *N, |
5743 | TargetLowering::DAGCombinerInfo &DCI) { |
5744 | auto *ASCN1 = cast<AddrSpaceCastSDNode>(Val: N); |
5745 | |
5746 | if (auto *ASCN2 = dyn_cast<AddrSpaceCastSDNode>(Val: ASCN1->getOperand(Num: 0))) { |
5747 | assert(ASCN2->getDestAddressSpace() == ASCN1->getSrcAddressSpace()); |
5748 | |
5749 | // Fold asc[B -> A](asc[A -> B](x)) -> x |
5750 | if (ASCN1->getDestAddressSpace() == ASCN2->getSrcAddressSpace()) |
5751 | return ASCN2->getOperand(Num: 0); |
5752 | } |
5753 | |
5754 | return SDValue(); |
5755 | } |
5756 | |
5757 | SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, |
5758 | DAGCombinerInfo &DCI) const { |
5759 | CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel(); |
5760 | switch (N->getOpcode()) { |
5761 | default: break; |
5762 | case ISD::ADD: |
5763 | return PerformADDCombine(N, DCI, OptLevel); |
5764 | case ISD::FADD: |
5765 | return PerformFADDCombine(N, DCI, OptLevel); |
5766 | case ISD::MUL: |
5767 | return PerformMULCombine(N, DCI, OptLevel); |
5768 | case ISD::SHL: |
5769 | return PerformSHLCombine(N, DCI, OptLevel); |
5770 | case ISD::AND: |
5771 | return PerformANDCombine(N, DCI); |
5772 | case ISD::UREM: |
5773 | case ISD::SREM: |
5774 | return PerformREMCombine(N, DCI, OptLevel); |
5775 | case ISD::SETCC: |
5776 | return PerformSETCCCombine(N, DCI, SmVersion: STI.getSmVersion()); |
5777 | case ISD::LOAD: |
5778 | case NVPTXISD::LoadParamV2: |
5779 | case NVPTXISD::LoadV2: |
5780 | case NVPTXISD::LoadV4: |
5781 | return combineUnpackingMovIntoLoad(N, DCI); |
5782 | case NVPTXISD::StoreParam: |
5783 | case NVPTXISD::StoreParamV2: |
5784 | case NVPTXISD::StoreParamV4: |
5785 | return PerformStoreParamCombine(N, DCI); |
5786 | case ISD::STORE: |
5787 | case NVPTXISD::StoreV2: |
5788 | case NVPTXISD::StoreV4: |
5789 | return PerformStoreCombine(N, DCI); |
5790 | case ISD::EXTRACT_VECTOR_ELT: |
5791 | return PerformEXTRACTCombine(N, DCI); |
5792 | case ISD::VSELECT: |
5793 | return PerformVSELECTCombine(N, DCI); |
5794 | case ISD::BUILD_VECTOR: |
5795 | return PerformBUILD_VECTORCombine(N, DCI); |
5796 | case ISD::ADDRSPACECAST: |
5797 | return combineADDRSPACECAST(N, DCI); |
5798 | } |
5799 | return SDValue(); |
5800 | } |
5801 | |
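/// Replace a bitcast to v2i8 by unpacking an i16 value, avoiding the default
/// promotion through stack memory.
///
/// A rough sketch (value names are illustrative):
///    v: v2i8 = bitcast x
/// ...is turned into...
///    i:  i16  = bitcast x
///    b0: i8   = truncate i
///    b1: i8   = truncate (srl i, 8)
///    v:  v2i8 = BUILD_VECTOR b0, b1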
5802 | static void ReplaceBITCAST(SDNode *Node, SelectionDAG &DAG, |
5803 | SmallVectorImpl<SDValue> &Results) { |
5804 | // Handle bitcasting to v2i8 without hitting the default promotion |
5805 | // strategy which goes through stack memory. |
5806 | SDValue Op(Node, 0); |
5807 | EVT ToVT = Op->getValueType(ResNo: 0); |
5808 | if (ToVT != MVT::v2i8) { |
5809 | return; |
5810 | } |
5811 | |
5812 | // Bitcast to i16 and unpack elements into a vector |
5813 | SDLoc DL(Node); |
5814 | SDValue AsInt = DAG.getBitcast(VT: MVT::i16, V: Op->getOperand(Num: 0)); |
5815 | SDValue Vec0 = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i8, Operand: AsInt); |
5816 | SDValue Const8 = DAG.getConstant(Val: 8, DL, VT: MVT::i16); |
5817 | SDValue Vec1 = |
5818 | DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i8, |
5819 | Operand: DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i16, Ops: {AsInt, Const8})); |
5820 | Results.push_back( |
5821 | Elt: DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL, VT: MVT::v2i8, Ops: {Vec0, Vec1})); |
5822 | } |
5823 | |
/// ReplaceLoadVector - Convert vector loads into multi-output scalar loads.
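///
/// A rough sketch (value names are illustrative):
///    v: v4f32,ch = load <p>
/// ...is turned into...
///    a,b,c,d,ch = LoadV4 <p>
///    v: v4f32 = BUILD_VECTOR a, b, c, d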
5825 | static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, |
5826 | SmallVectorImpl<SDValue> &Results, |
5827 | const NVPTXSubtarget &STI) { |
5828 | LoadSDNode *LD = cast<LoadSDNode>(Val: N); |
5829 | const EVT ResVT = LD->getValueType(ResNo: 0); |
5830 | const EVT MemVT = LD->getMemoryVT(); |
5831 | |
5832 | // If we're doing sign/zero extension as part of the load, avoid lowering to |
5833 | // a LoadV node. TODO: consider relaxing this restriction. |
5834 | if (ResVT != MemVT) |
5835 | return; |
5836 | |
5837 | const auto NumEltsAndEltVT = getVectorLoweringShape( |
5838 | VectorEVT: ResVT, CanLowerTo256Bit: STI.has256BitVectorLoadStore(AS: LD->getAddressSpace())); |
5839 | if (!NumEltsAndEltVT) |
5840 | return; |
5841 | const auto [NumElts, EltVT] = NumEltsAndEltVT.value(); |
5842 | |
5843 | Align Alignment = LD->getAlign(); |
5844 | const auto &TD = DAG.getDataLayout(); |
5845 | Align PrefAlign = TD.getPrefTypeAlign(Ty: MemVT.getTypeForEVT(Context&: *DAG.getContext())); |
5846 | if (Alignment < PrefAlign) { |
5847 | // This load is not sufficiently aligned, so bail out and let this vector |
5848 | // load be scalarized. Note that we may still be able to emit smaller |
5849 | // vector loads. For example, if we are loading a <4 x float> with an |
5850 | // alignment of 8, this check will fail but the legalizer will try again |
5851 | // with 2 x <2 x float>, which will succeed with an alignment of 8. |
5852 | return; |
5853 | } |
5854 | |
5855 | // Since LoadV2 is a target node, we cannot rely on DAG type legalization. |
5856 | // Therefore, we must ensure the type is legal. For i1 and i8, we set the |
5857 | // loaded type to i16 and propagate the "real" type as the memory type. |
5858 | const MVT LoadEltVT = (EltVT.getSizeInBits() < 16) ? MVT::i16 : EltVT; |
5859 | |
5860 | unsigned Opcode; |
5861 | switch (NumElts) { |
5862 | default: |
5863 | return; |
5864 | case 2: |
5865 | Opcode = NVPTXISD::LoadV2; |
5866 | break; |
5867 | case 4: |
5868 | Opcode = NVPTXISD::LoadV4; |
5869 | break; |
5870 | case 8: |
5871 | Opcode = NVPTXISD::LoadV8; |
5872 | break; |
5873 | } |
5874 | auto ListVTs = SmallVector<EVT, 9>(NumElts, LoadEltVT); |
5875 | ListVTs.push_back(Elt: MVT::Other); |
5876 | SDVTList LdResVTs = DAG.getVTList(VTs: ListVTs); |
5877 | |
5878 | SDLoc DL(LD); |
5879 | |
5880 | // Copy regular operands |
5881 | SmallVector<SDValue, 8> OtherOps(LD->ops()); |
5882 | |
5883 | // The select routine does not have access to the LoadSDNode instance, so |
5884 | // pass along the extension information |
5885 | OtherOps.push_back(Elt: DAG.getIntPtrConstant(Val: LD->getExtensionType(), DL)); |
5886 | |
5887 | SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, dl: DL, VTList: LdResVTs, Ops: OtherOps, |
5888 | MemVT: LD->getMemoryVT(), |
5889 | MMO: LD->getMemOperand()); |
5890 | |
5891 | SmallVector<SDValue> ScalarRes; |
5892 | if (EltVT.isVector()) { |
5893 | assert(EVT(EltVT.getVectorElementType()) == ResVT.getVectorElementType()); |
5894 | assert(NumElts * EltVT.getVectorNumElements() == |
5895 | ResVT.getVectorNumElements()); |
5896 | // Generate EXTRACT_VECTOR_ELTs to split v2[i,f,bf]16/v4i8 subvectors back |
5897 | // into individual elements. |
5898 | for (const unsigned I : llvm::seq(Size: NumElts)) { |
5899 | SDValue SubVector = NewLD.getValue(R: I); |
5900 | DAG.ExtractVectorElements(Op: SubVector, Args&: ScalarRes); |
5901 | } |
5902 | } else { |
5903 | for (const unsigned I : llvm::seq(Size: NumElts)) { |
5904 | SDValue Res = NewLD.getValue(R: I); |
5905 | if (LoadEltVT != EltVT) |
5906 | Res = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: EltVT, Operand: Res); |
5907 | ScalarRes.push_back(Elt: Res); |
5908 | } |
5909 | } |
5910 | |
5911 | SDValue LoadChain = NewLD.getValue(R: NumElts); |
5912 | |
5913 | const MVT BuildVecVT = |
5914 | MVT::getVectorVT(VT: EltVT.getScalarType(), NumElements: ScalarRes.size()); |
5915 | SDValue BuildVec = DAG.getBuildVector(VT: BuildVecVT, DL, Ops: ScalarRes); |
5916 | SDValue LoadValue = DAG.getBitcast(VT: ResVT, V: BuildVec); |
5917 | |
5918 | Results.append(IL: {LoadValue, LoadChain}); |
5919 | } |
5920 | |
5921 | // Lower vector return type of tcgen05.ld intrinsics |
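// A rough sketch (value names are illustrative): an intrinsic with a v4i32
// result is rebuilt as a memory-intrinsic node returning 4 x i32 plus a chain,
// and the scalar results are reassembled with a BUILD_VECTOR.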
5922 | static void ReplaceTcgen05Ld(SDNode *N, SelectionDAG &DAG, |
5923 | SmallVectorImpl<SDValue> &Results, |
5924 | bool hasOffset = false) { |
5925 | SDLoc DL(N); |
5926 | EVT ResVT = N->getValueType(ResNo: 0); |
5927 | if (!ResVT.isVector()) |
5928 | return; // already legalized. |
5929 | |
5930 | const unsigned NumElts = ResVT.getVectorNumElements(); |
5931 | |
5932 | // Create the return type of the instructions |
5933 | SmallVector<EVT, 5> ListVTs; |
5934 | for (unsigned i = 0; i < NumElts; ++i) |
5935 | ListVTs.push_back(Elt: MVT::i32); |
5936 | |
5937 | ListVTs.push_back(Elt: N->getValueType(ResNo: 1)); // Chain |
5938 | |
5939 | SDVTList ResVTs = DAG.getVTList(VTs: ListVTs); |
5940 | |
5941 | SmallVector<SDValue, 8> Ops{N->getOperand(Num: 0), N->getOperand(Num: 1), |
5942 | N->getOperand(Num: 2)}; |
5943 | |
5944 | if (hasOffset) { |
5945 | Ops.push_back(Elt: N->getOperand(Num: 3)); // offset |
5946 | Ops.push_back(Elt: N->getOperand(Num: 4)); // Pack flag |
5947 | } else |
5948 | Ops.push_back(Elt: N->getOperand(Num: 3)); // Pack flag |
5949 | |
5950 | MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(Val: N); |
5951 | SDValue NewNode = |
5952 | DAG.getMemIntrinsicNode(Opcode: ISD::INTRINSIC_W_CHAIN, dl: DL, VTList: ResVTs, Ops, |
5953 | MemVT: MemSD->getMemoryVT(), MMO: MemSD->getMemOperand()); |
5954 | |
5955 | // split the vector result |
5956 | SmallVector<SDValue, 4> ScalarRes; |
5957 | for (unsigned i = 0; i < NumElts; ++i) { |
5958 | SDValue Res = NewNode.getValue(R: i); |
5959 | ScalarRes.push_back(Elt: Res); |
5960 | } |
5961 | |
5962 | SDValue Chain = NewNode.getValue(R: NumElts); |
5963 | SDValue BuildVector = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL, VT: ResVT, Ops: ScalarRes); |
5964 | Results.push_back(Elt: BuildVector); // Build Vector |
5965 | Results.push_back(Elt: Chain); // Chain |
5966 | } |
5967 | |
5968 | static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, |
5969 | SmallVectorImpl<SDValue> &Results) { |
5970 | SDValue Chain = N->getOperand(Num: 0); |
5971 | SDValue Intrin = N->getOperand(Num: 1); |
5972 | SDLoc DL(N); |
5973 | |
5974 | // Get the intrinsic ID |
5975 | unsigned IntrinNo = Intrin.getNode()->getAsZExtVal(); |
5976 | switch (IntrinNo) { |
5977 | default: |
5978 | return; |
5979 | case Intrinsic::nvvm_ldu_global_i: |
5980 | case Intrinsic::nvvm_ldu_global_f: |
5981 | case Intrinsic::nvvm_ldu_global_p: { |
5982 | EVT ResVT = N->getValueType(ResNo: 0); |
5983 | |
5984 | if (ResVT.isVector()) { |
5985 | // Vector LDG/LDU |
5986 | |
5987 | unsigned NumElts = ResVT.getVectorNumElements(); |
5988 | EVT EltVT = ResVT.getVectorElementType(); |
5989 | |
5990 | // Since LDU/LDG are target nodes, we cannot rely on DAG type |
5991 | // legalization. |
5992 | // Therefore, we must ensure the type is legal. For i1 and i8, we set the |
5993 | // loaded type to i16 and propagate the "real" type as the memory type. |
5994 | bool NeedTrunc = false; |
5995 | if (EltVT.getSizeInBits() < 16) { |
5996 | EltVT = MVT::i16; |
5997 | NeedTrunc = true; |
5998 | } |
5999 | |
6000 | unsigned Opcode = 0; |
6001 | SDVTList LdResVTs; |
6002 | |
6003 | switch (NumElts) { |
6004 | default: |
6005 | return; |
6006 | case 2: |
6007 | Opcode = NVPTXISD::LDUV2; |
6008 | LdResVTs = DAG.getVTList(VT1: EltVT, VT2: EltVT, VT3: MVT::Other); |
6009 | break; |
6010 | case 4: { |
6011 | Opcode = NVPTXISD::LDUV4; |
6012 | EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other }; |
6013 | LdResVTs = DAG.getVTList(VTs: ListVTs); |
6014 | break; |
6015 | } |
6016 | } |
6017 | |
6018 | SmallVector<SDValue, 8> OtherOps; |
6019 | |
6020 | // Copy regular operands |
6021 | |
6022 | OtherOps.push_back(Elt: Chain); // Chain |
6023 | // Skip operand 1 (intrinsic ID) |
6024 | // Others |
6025 | OtherOps.append(in_start: N->op_begin() + 2, in_end: N->op_end()); |
6026 | |
6027 | MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(Val: N); |
6028 | |
6029 | SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, dl: DL, VTList: LdResVTs, Ops: OtherOps, |
6030 | MemVT: MemSD->getMemoryVT(), |
6031 | MMO: MemSD->getMemOperand()); |
6032 | |
6033 | SmallVector<SDValue, 4> ScalarRes; |
6034 | |
6035 | for (unsigned i = 0; i < NumElts; ++i) { |
6036 | SDValue Res = NewLD.getValue(R: i); |
6037 | if (NeedTrunc) |
6038 | Res = |
6039 | DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: ResVT.getVectorElementType(), Operand: Res); |
6040 | ScalarRes.push_back(Elt: Res); |
6041 | } |
6042 | |
6043 | SDValue LoadChain = NewLD.getValue(R: NumElts); |
6044 | |
6045 | SDValue BuildVec = |
6046 | DAG.getBuildVector(VT: ResVT, DL, Ops: ScalarRes); |
6047 | |
6048 | Results.push_back(Elt: BuildVec); |
6049 | Results.push_back(Elt: LoadChain); |
6050 | } else { |
6051 | // i8 LDG/LDU |
6052 | assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 && |
6053 | "Custom handling of non-i8 ldu/ldg?" ); |
6054 | |
6055 | // Just copy all operands as-is |
6056 | SmallVector<SDValue, 4> Ops(N->ops()); |
6057 | |
6058 | // Force output to i16 |
6059 | SDVTList LdResVTs = DAG.getVTList(VT1: MVT::i16, VT2: MVT::Other); |
6060 | |
6061 | MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(Val: N); |
6062 | |
6063 | // We make sure the memory type is i8, which will be used during isel |
6064 | // to select the proper instruction. |
6065 | SDValue NewLD = |
6066 | DAG.getMemIntrinsicNode(Opcode: ISD::INTRINSIC_W_CHAIN, dl: DL, VTList: LdResVTs, Ops, |
6067 | MemVT: MVT::i8, MMO: MemSD->getMemOperand()); |
6068 | |
6069 | Results.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i8, |
6070 | Operand: NewLD.getValue(R: 0))); |
6071 | Results.push_back(Elt: NewLD.getValue(R: 1)); |
6072 | } |
6073 | return; |
6074 | } |
6075 | |
6076 | case Intrinsic::nvvm_tcgen05_ld_16x64b_x2: |
6077 | case Intrinsic::nvvm_tcgen05_ld_16x64b_x4: |
6078 | case Intrinsic::nvvm_tcgen05_ld_16x64b_x8: |
6079 | case Intrinsic::nvvm_tcgen05_ld_16x64b_x16: |
6080 | case Intrinsic::nvvm_tcgen05_ld_16x64b_x32: |
6081 | case Intrinsic::nvvm_tcgen05_ld_16x64b_x64: |
6082 | case Intrinsic::nvvm_tcgen05_ld_16x64b_x128: |
6083 | case Intrinsic::nvvm_tcgen05_ld_32x32b_x2: |
6084 | case Intrinsic::nvvm_tcgen05_ld_32x32b_x4: |
6085 | case Intrinsic::nvvm_tcgen05_ld_32x32b_x8: |
6086 | case Intrinsic::nvvm_tcgen05_ld_32x32b_x16: |
6087 | case Intrinsic::nvvm_tcgen05_ld_32x32b_x32: |
6088 | case Intrinsic::nvvm_tcgen05_ld_32x32b_x64: |
6089 | case Intrinsic::nvvm_tcgen05_ld_32x32b_x128: |
6090 | case Intrinsic::nvvm_tcgen05_ld_16x128b_x1: |
6091 | case Intrinsic::nvvm_tcgen05_ld_16x128b_x2: |
6092 | case Intrinsic::nvvm_tcgen05_ld_16x128b_x4: |
6093 | case Intrinsic::nvvm_tcgen05_ld_16x128b_x8: |
6094 | case Intrinsic::nvvm_tcgen05_ld_16x128b_x16: |
6095 | case Intrinsic::nvvm_tcgen05_ld_16x128b_x32: |
6096 | case Intrinsic::nvvm_tcgen05_ld_16x128b_x64: |
6097 | case Intrinsic::nvvm_tcgen05_ld_16x256b_x1: |
6098 | case Intrinsic::nvvm_tcgen05_ld_16x256b_x2: |
6099 | case Intrinsic::nvvm_tcgen05_ld_16x256b_x4: |
6100 | case Intrinsic::nvvm_tcgen05_ld_16x256b_x8: |
6101 | case Intrinsic::nvvm_tcgen05_ld_16x256b_x16: |
6102 | case Intrinsic::nvvm_tcgen05_ld_16x256b_x32: |
6103 | return ReplaceTcgen05Ld(N, DAG, Results); |
6104 | |
6105 | case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2: |
6106 | case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4: |
6107 | case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8: |
6108 | case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16: |
6109 | case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32: |
6110 | case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64: |
6111 | case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128: |
6112 | return ReplaceTcgen05Ld(N, DAG, Results, /* Offset */ hasOffset: true); |
6113 | } |
6114 | } |
6115 | |
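/// A rough sketch of the 128-bit CopyFromReg legalization below (value names
/// are illustrative):
///    v: i128,ch,glue = CopyFromReg chain, reg, glue
/// ...is turned into...
///    lo,hi,ch,glue = CopyFromReg chain, reg, glue
///    v: i128 = BUILD_PAIR lo, hi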
6116 | static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG, |
6117 | SmallVectorImpl<SDValue> &Results) { |
  // Change the CopyFromReg to output two 64-bit results instead of a single
  // 128-bit result so that it can pass legalization.
6120 | SDLoc DL(N); |
6121 | SDValue Chain = N->getOperand(Num: 0); |
6122 | SDValue Reg = N->getOperand(Num: 1); |
6123 | SDValue Glue = N->getOperand(Num: 2); |
6124 | |
6125 | assert(Reg.getValueType() == MVT::i128 && |
6126 | "Custom lowering for CopyFromReg with 128-bit reg only" ); |
6127 | SmallVector<EVT, 4> ResultsType = {MVT::i64, MVT::i64, N->getValueType(ResNo: 1), |
6128 | N->getValueType(ResNo: 2)}; |
6129 | SmallVector<SDValue, 3> NewOps = {Chain, Reg, Glue}; |
6130 | |
6131 | SDValue NewValue = DAG.getNode(Opcode: ISD::CopyFromReg, DL, ResultTys: ResultsType, Ops: NewOps); |
6132 | SDValue Pair = DAG.getNode(Opcode: ISD::BUILD_PAIR, DL, VT: MVT::i128, |
6133 | Ops: {NewValue.getValue(R: 0), NewValue.getValue(R: 1)}); |
6134 | |
6135 | Results.push_back(Elt: Pair); |
6136 | Results.push_back(Elt: NewValue.getValue(R: 2)); |
6137 | Results.push_back(Elt: NewValue.getValue(R: 3)); |
6138 | } |
6139 | |
6140 | void NVPTXTargetLowering::ReplaceNodeResults( |
6141 | SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { |
6142 | switch (N->getOpcode()) { |
6143 | default: |
6144 | report_fatal_error(reason: "Unhandled custom legalization" ); |
6145 | case ISD::BITCAST: |
6146 | ReplaceBITCAST(Node: N, DAG, Results); |
6147 | return; |
6148 | case ISD::LOAD: |
6149 | ReplaceLoadVector(N, DAG, Results, STI); |
6150 | return; |
6151 | case ISD::INTRINSIC_W_CHAIN: |
6152 | ReplaceINTRINSIC_W_CHAIN(N, DAG, Results); |
6153 | return; |
6154 | case ISD::CopyFromReg: |
6155 | ReplaceCopyFromReg_128(N, DAG, Results); |
6156 | return; |
6157 | } |
6158 | } |
6159 | |
6160 | NVPTXTargetLowering::AtomicExpansionKind |
6161 | NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { |
6162 | Type *Ty = AI->getValOperand()->getType(); |
6163 | |
6164 | if (AI->isFloatingPointOperation()) { |
6165 | if (AI->getOperation() == AtomicRMWInst::BinOp::FAdd) { |
6166 | if (Ty->isHalfTy() && STI.getSmVersion() >= 70 && |
6167 | STI.getPTXVersion() >= 63) |
6168 | return AtomicExpansionKind::None; |
6169 | if (Ty->isBFloatTy() && STI.getSmVersion() >= 90 && |
6170 | STI.getPTXVersion() >= 78) |
6171 | return AtomicExpansionKind::None; |
6172 | if (Ty->isFloatTy()) |
6173 | return AtomicExpansionKind::None; |
6174 | if (Ty->isDoubleTy() && STI.hasAtomAddF64()) |
6175 | return AtomicExpansionKind::None; |
6176 | } |
6177 | return AtomicExpansionKind::CmpXChg; |
6178 | } |
6179 | |
6180 | assert(Ty->isIntegerTy() && "Ty should be integer at this point" ); |
6181 | auto ITy = cast<llvm::IntegerType>(Val: Ty); |
6182 | |
6183 | switch (AI->getOperation()) { |
6184 | default: |
6185 | return AtomicExpansionKind::CmpXChg; |
6186 | case AtomicRMWInst::BinOp::And: |
6187 | case AtomicRMWInst::BinOp::Or: |
6188 | case AtomicRMWInst::BinOp::Xor: |
6189 | case AtomicRMWInst::BinOp::Xchg: |
6190 | switch (ITy->getBitWidth()) { |
6191 | case 8: |
6192 | case 16: |
6193 | return AtomicExpansionKind::CmpXChg; |
6194 | case 32: |
6195 | return AtomicExpansionKind::None; |
6196 | case 64: |
6197 | if (STI.hasAtomBitwise64()) |
6198 | return AtomicExpansionKind::None; |
6199 | return AtomicExpansionKind::CmpXChg; |
6200 | default: |
6201 | llvm_unreachable("unsupported width encountered" ); |
6202 | } |
6203 | case AtomicRMWInst::BinOp::Add: |
6204 | case AtomicRMWInst::BinOp::Sub: |
6205 | case AtomicRMWInst::BinOp::Max: |
6206 | case AtomicRMWInst::BinOp::Min: |
6207 | case AtomicRMWInst::BinOp::UMax: |
6208 | case AtomicRMWInst::BinOp::UMin: |
6209 | switch (ITy->getBitWidth()) { |
6210 | case 8: |
6211 | case 16: |
6212 | return AtomicExpansionKind::CmpXChg; |
6213 | case 32: |
6214 | return AtomicExpansionKind::None; |
6215 | case 64: |
6216 | if (STI.hasAtomMinMax64()) |
6217 | return AtomicExpansionKind::None; |
6218 | return AtomicExpansionKind::CmpXChg; |
6219 | default: |
6220 | llvm_unreachable("unsupported width encountered" ); |
6221 | } |
6222 | case AtomicRMWInst::BinOp::UIncWrap: |
6223 | case AtomicRMWInst::BinOp::UDecWrap: |
6224 | switch (ITy->getBitWidth()) { |
6225 | case 32: |
6226 | return AtomicExpansionKind::None; |
6227 | case 8: |
6228 | case 16: |
6229 | case 64: |
6230 | return AtomicExpansionKind::CmpXChg; |
6231 | default: |
6232 | llvm_unreachable("unsupported width encountered" ); |
6233 | } |
6234 | } |
6235 | |
6236 | return AtomicExpansionKind::CmpXChg; |
6237 | } |
6238 | |
6239 | bool NVPTXTargetLowering::shouldInsertFencesForAtomic( |
6240 | const Instruction *I) const { |
6241 | auto *CI = dyn_cast<AtomicCmpXchgInst>(Val: I); |
6242 | // When CAS bitwidth is not supported on the hardware, the CAS is emulated |
6243 | // using a retry loop that uses a higher-bitwidth monotonic CAS. We enforce |
6244 | // the memory order using explicit fences around the retry loop. |
6245 | // The memory order of natively supported CAS operations can be enforced |
6246 | // by lowering to an atom.cas with the right memory synchronizing effect. |
6247 | // However, atom.cas only supports relaxed, acquire, release and acq_rel. |
6248 | // So we also use explicit fences for enforcing memory order for |
  // seq_cst CAS with natively-supported bitwidths.
6250 | return CI && |
6251 | (cast<IntegerType>(Val: CI->getCompareOperand()->getType())->getBitWidth() < |
6252 | STI.getMinCmpXchgSizeInBits() || |
6253 | CI->getMergedOrdering() == AtomicOrdering::SequentiallyConsistent); |
6254 | } |
6255 | |
6256 | AtomicOrdering NVPTXTargetLowering::atomicOperationOrderAfterFenceSplit( |
6257 | const Instruction *I) const { |
6258 | auto *CI = dyn_cast<AtomicCmpXchgInst>(Val: I); |
6259 | bool BitwidthSupportedAndIsSeqCst = |
6260 | CI && CI->getMergedOrdering() == AtomicOrdering::SequentiallyConsistent && |
6261 | cast<IntegerType>(Val: CI->getCompareOperand()->getType())->getBitWidth() >= |
6262 | STI.getMinCmpXchgSizeInBits(); |
6263 | return BitwidthSupportedAndIsSeqCst ? AtomicOrdering::Acquire |
6264 | : AtomicOrdering::Monotonic; |
6265 | } |
6266 | |
6267 | Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder, |
6268 | Instruction *Inst, |
6269 | AtomicOrdering Ord) const { |
6270 | if (!isa<AtomicCmpXchgInst>(Val: Inst)) |
6271 | return TargetLoweringBase::emitLeadingFence(Builder, Inst, Ord); |
6272 | |
6273 | // Specialize for cmpxchg |
6274 | // Emit a fence.sc leading fence for cmpxchg seq_cst which are not emulated |
6275 | if (isReleaseOrStronger(AO: Ord)) |
6276 | return Ord == AtomicOrdering::SequentiallyConsistent |
6277 | ? Builder.CreateFence(Ordering: AtomicOrdering::SequentiallyConsistent) |
6278 | : Builder.CreateFence(Ordering: AtomicOrdering::Release); |
6279 | |
6280 | return nullptr; |
6281 | } |
6282 | |
6283 | Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder, |
6284 | Instruction *Inst, |
6285 | AtomicOrdering Ord) const { |
6286 | // Specialize for cmpxchg |
6287 | if (!isa<AtomicCmpXchgInst>(Val: Inst)) |
6288 | return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord); |
6289 | |
6290 | auto CASWidth = |
6291 | cast<IntegerType>( |
6292 | Val: dyn_cast<AtomicCmpXchgInst>(Val: Inst)->getCompareOperand()->getType()) |
6293 | ->getBitWidth(); |
6294 | // Do not emit a trailing fence for cmpxchg seq_cst which are not emulated |
6295 | if (isAcquireOrStronger(AO: Ord) && |
6296 | (Ord != AtomicOrdering::SequentiallyConsistent || |
6297 | CASWidth < STI.getMinCmpXchgSizeInBits())) |
6298 | return Builder.CreateFence(Ordering: AtomicOrdering::Acquire); |
6299 | |
6300 | return nullptr; |
6301 | } |
6302 | |
6303 | // Rather than default to SINT when both UINT and SINT are custom, we only |
6304 | // change the opcode when UINT is not legal and SINT is. UINT is preferred when |
6305 | // both are custom since unsigned CVT instructions can lead to slightly better |
6306 | // SASS code with fewer instructions. |
6307 | unsigned NVPTXTargetLowering::getPreferredFPToIntOpcode(unsigned Op, EVT FromVT, |
6308 | EVT ToVT) const { |
6309 | if (isOperationLegal(Op, VT: ToVT)) |
6310 | return Op; |
6311 | switch (Op) { |
6312 | case ISD::FP_TO_UINT: |
6313 | if (isOperationLegal(Op: ISD::FP_TO_SINT, VT: ToVT)) |
6314 | return ISD::FP_TO_SINT; |
6315 | break; |
6316 | case ISD::STRICT_FP_TO_UINT: |
6317 | if (isOperationLegal(Op: ISD::STRICT_FP_TO_SINT, VT: ToVT)) |
6318 | return ISD::STRICT_FP_TO_SINT; |
6319 | break; |
6320 | case ISD::VP_FP_TO_UINT: |
6321 | if (isOperationLegal(Op: ISD::VP_FP_TO_SINT, VT: ToVT)) |
6322 | return ISD::VP_FP_TO_SINT; |
6323 | break; |
6324 | default: |
6325 | break; |
6326 | } |
6327 | return Op; |
6328 | } |
6329 | |
6330 | // Pin NVPTXTargetObjectFile's vtables to this file. |
6331 | NVPTXTargetObjectFile::~NVPTXTargetObjectFile() = default; |
6332 | |
6333 | MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal( |
6334 | const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const { |
6335 | return getDataSection(); |
6336 | } |
6337 | |