//===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines an instruction selector for the NVPTX target.
//
//===----------------------------------------------------------------------===//

#include "NVPTXISelDAGToDAG.h"
#include "NVPTX.h"
#include "NVPTXUtilities.h"
#include "llvm/ADT/APInt.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/NVVMIntrinsicUtils.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/MathExtras.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "nvptx-isel"
#define PASS_NAME "NVPTX DAG->DAG Pattern Instruction Selection"

static cl::opt<bool>
    EnableRsqrtOpt("nvptx-rsqrt-approx-opt", cl::init(true), cl::Hidden,
                   cl::desc("Enable reciprocal sqrt optimization"));
40
41/// createNVPTXISelDag - This pass converts a legalized DAG into a
42/// NVPTX-specific DAG, ready for instruction scheduling.
43FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
44 llvm::CodeGenOptLevel OptLevel) {
45 return new NVPTXDAGToDAGISelLegacy(TM, OptLevel);
46}
47
48NVPTXDAGToDAGISelLegacy::NVPTXDAGToDAGISelLegacy(NVPTXTargetMachine &tm,
49 CodeGenOptLevel OptLevel)
50 : SelectionDAGISelLegacy(
51 ID, std::make_unique<NVPTXDAGToDAGISel>(args&: tm, args&: OptLevel)) {}
52
53char NVPTXDAGToDAGISelLegacy::ID = 0;
54
55INITIALIZE_PASS(NVPTXDAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false)
56
57NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
58 CodeGenOptLevel OptLevel)
59 : SelectionDAGISel(tm, OptLevel), TM(tm) {
60 doMulWide = (OptLevel > CodeGenOptLevel::None);
61}
62
63bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
64 Subtarget = &MF.getSubtarget<NVPTXSubtarget>();
65 Scopes = NVPTXScopes(MF.getFunction().getContext());
66 return SelectionDAGISel::runOnMachineFunction(mf&: MF);
67}
68
69NVPTX::DivPrecisionLevel
70NVPTXDAGToDAGISel::getDivF32Level(const SDNode *N) const {
71 return Subtarget->getTargetLowering()->getDivF32Level(MF: *MF, N: *N);
72}
73
74bool NVPTXDAGToDAGISel::usePrecSqrtF32(const SDNode *N) const {
75 return Subtarget->getTargetLowering()->usePrecSqrtF32(MF: *MF, N);
76}
77
78bool NVPTXDAGToDAGISel::useF32FTZ() const {
79 return Subtarget->getTargetLowering()->useF32FTZ(MF: *MF);
80}
81
82bool NVPTXDAGToDAGISel::allowFMA() const {
83 const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
84 return TL->allowFMA(MF&: *MF, OptLevel);
85}
86
87bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const {
88 const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
89 return TL->allowUnsafeFPMath(MF: *MF);
90}
91
92bool NVPTXDAGToDAGISel::doRsqrtOpt() const { return EnableRsqrtOpt; }
93
94/// Select - Select instructions not customized! Used for
95/// expanded, promoted and normal instructions.
96void NVPTXDAGToDAGISel::Select(SDNode *N) {
97
98 if (N->isMachineOpcode()) {
99 N->setNodeId(-1);
100 return; // Already selected.
101 }
102
103 switch (N->getOpcode()) {
104 case ISD::LOAD:
105 case ISD::ATOMIC_LOAD:
106 if (tryLoad(N))
107 return;
108 break;
109 case ISD::STORE:
110 case ISD::ATOMIC_STORE:
111 if (tryStore(N))
112 return;
113 break;
114 case ISD::ATOMIC_FENCE:
115 if (tryFence(N))
116 return;
117 break;
118 case NVPTXISD::UNPACK_VECTOR:
119 tryUNPACK_VECTOR(N);
120 return;
121 case ISD::EXTRACT_VECTOR_ELT:
122 if (tryEXTRACT_VECTOR_ELEMENT(N))
123 return;
124 break;
125 case NVPTXISD::SETP_F16X2:
126 SelectSETP_F16X2(N);
127 return;
128 case NVPTXISD::SETP_BF16X2:
129 SelectSETP_BF16X2(N);
130 return;
131 case NVPTXISD::LoadV2:
132 case NVPTXISD::LoadV4:
133 case NVPTXISD::LoadV8:
134 if (tryLoadVector(N))
135 return;
136 break;
137 case NVPTXISD::LDUV2:
138 case NVPTXISD::LDUV4:
139 if (tryLDU(N))
140 return;
141 break;
142 case NVPTXISD::StoreV2:
143 case NVPTXISD::StoreV4:
144 case NVPTXISD::StoreV8:
145 if (tryStoreVector(N))
146 return;
147 break;
148 case NVPTXISD::LoadParam:
149 case NVPTXISD::LoadParamV2:
150 case NVPTXISD::LoadParamV4:
151 if (tryLoadParam(N))
152 return;
153 break;
154 case NVPTXISD::StoreParam:
155 case NVPTXISD::StoreParamV2:
156 case NVPTXISD::StoreParamV4:
157 if (tryStoreParam(N))
158 return;
159 break;
160 case ISD::INTRINSIC_W_CHAIN:
161 if (tryIntrinsicChain(N))
162 return;
163 break;
164 case ISD::INTRINSIC_VOID:
165 if (tryIntrinsicVoid(N))
166 return;
167 break;
168 case ISD::AND:
169 case ISD::SRA:
170 case ISD::SRL:
171 // Try to select BFE
172 if (tryBFE(N))
173 return;
174 break;
175 case ISD::ADDRSPACECAST:
176 SelectAddrSpaceCast(N);
177 return;
178 case ISD::CopyToReg: {
179 if (N->getOperand(Num: 1).getValueType() == MVT::i128) {
180 SelectV2I64toI128(N);
181 return;
182 }
183 break;
184 }
185 case ISD::CopyFromReg: {
186 if (N->getOperand(Num: 1).getValueType() == MVT::i128) {
187 SelectI128toV2I64(N);
188 return;
189 }
190 break;
191 }
192 case ISD::FADD:
193 case ISD::FMUL:
194 case ISD::FSUB:
195 if (tryBF16ArithToFMA(N))
196 return;
197 break;
198 default:
199 break;
200 }
201 SelectCode(N);
202}
203
204#define TCGEN05_LD_OPCODE(SHAPE, NUM) \
205 (enablePack ? NVPTX::TCGEN05_LD_##SHAPE##_##NUM##_PACK \
206 : NVPTX::TCGEN05_LD_##SHAPE##_##NUM)
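
// Illustrative expansion of the macro above: TCGEN05_LD_OPCODE(16x64b, x2)
// evaluates to NVPTX::TCGEN05_LD_16x64b_x2_PACK when enablePack is true and
// to NVPTX::TCGEN05_LD_16x64b_x2 otherwise; getTcgen05LdOpcode() below simply
// maps each intrinsic ID onto one such expansion.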
207
208static unsigned getTcgen05LdOpcode(unsigned IID, bool enablePack) {
209 switch (IID) {
210 case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
211 return TCGEN05_LD_OPCODE(16x64b, x1);
212 case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
213 return TCGEN05_LD_OPCODE(16x64b, x2);
214 case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
215 return TCGEN05_LD_OPCODE(16x64b, x4);
216 case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
217 return TCGEN05_LD_OPCODE(16x64b, x8);
218 case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
219 return TCGEN05_LD_OPCODE(16x64b, x16);
220 case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
221 return TCGEN05_LD_OPCODE(16x64b, x32);
222 case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
223 return TCGEN05_LD_OPCODE(16x64b, x64);
224 case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
225 return TCGEN05_LD_OPCODE(16x64b, x128);
226 case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
227 return TCGEN05_LD_OPCODE(16x128b, x1);
228 case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
229 return TCGEN05_LD_OPCODE(16x128b, x2);
230 case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
231 return TCGEN05_LD_OPCODE(16x128b, x4);
232 case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
233 return TCGEN05_LD_OPCODE(16x128b, x8);
234 case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
235 return TCGEN05_LD_OPCODE(16x128b, x16);
236 case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
237 return TCGEN05_LD_OPCODE(16x128b, x32);
238 case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
239 return TCGEN05_LD_OPCODE(16x128b, x64);
240 case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
241 return TCGEN05_LD_OPCODE(16x256b, x1);
242 case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
243 return TCGEN05_LD_OPCODE(16x256b, x2);
244 case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
245 return TCGEN05_LD_OPCODE(16x256b, x4);
246 case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
247 return TCGEN05_LD_OPCODE(16x256b, x8);
248 case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
249 return TCGEN05_LD_OPCODE(16x256b, x16);
250 case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
251 return TCGEN05_LD_OPCODE(16x256b, x32);
252 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1:
253 return TCGEN05_LD_OPCODE(16x32bx2, x1);
254 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
255 return TCGEN05_LD_OPCODE(16x32bx2, x2);
256 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
257 return TCGEN05_LD_OPCODE(16x32bx2, x4);
258 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
259 return TCGEN05_LD_OPCODE(16x32bx2, x8);
260 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
261 return TCGEN05_LD_OPCODE(16x32bx2, x16);
262 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
263 return TCGEN05_LD_OPCODE(16x32bx2, x32);
264 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
265 return TCGEN05_LD_OPCODE(16x32bx2, x64);
266 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128:
267 return TCGEN05_LD_OPCODE(16x32bx2, x128);
268 case Intrinsic::nvvm_tcgen05_ld_32x32b_x1:
269 return TCGEN05_LD_OPCODE(32x32b, x1);
270 case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
271 return TCGEN05_LD_OPCODE(32x32b, x2);
272 case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
273 return TCGEN05_LD_OPCODE(32x32b, x4);
274 case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
275 return TCGEN05_LD_OPCODE(32x32b, x8);
276 case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
277 return TCGEN05_LD_OPCODE(32x32b, x16);
278 case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
279 return TCGEN05_LD_OPCODE(32x32b, x32);
280 case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
281 return TCGEN05_LD_OPCODE(32x32b, x64);
282 case Intrinsic::nvvm_tcgen05_ld_32x32b_x128:
283 return TCGEN05_LD_OPCODE(32x32b, x128);
284 }
285 llvm_unreachable("unhandled tcgen05.ld lowering");
286}
287
288void NVPTXDAGToDAGISel::SelectTcgen05Ld(SDNode *N, bool hasOffset) {
289 SDLoc DL(N);
290 unsigned IID = cast<ConstantSDNode>(Val: N->getOperand(Num: 1))->getZExtValue();
291
292 if (hasOffset) {
293 bool enablePack = cast<ConstantSDNode>(Val: N->getOperand(Num: 4))->getZExtValue();
294 auto OffsetNode = CurDAG->getTargetConstant(
295 Val: cast<ConstantSDNode>(Val: N->getOperand(Num: 3))->getZExtValue(), DL, VT: MVT::i32);
296 ReplaceNode(F: N, T: CurDAG->getMachineNode(
297 Opcode: getTcgen05LdOpcode(IID, enablePack), dl: DL, VTs: N->getVTList(),
298 Ops: {N->getOperand(Num: 2), OffsetNode, N->getOperand(Num: 0)}));
299 } else {
300 bool enablePack = cast<ConstantSDNode>(Val: N->getOperand(Num: 3))->getZExtValue();
301 ReplaceNode(F: N, T: CurDAG->getMachineNode(
302 Opcode: getTcgen05LdOpcode(IID, enablePack), dl: DL, VTs: N->getVTList(),
303 Ops: {N->getOperand(Num: 2), N->getOperand(Num: 0)}));
304 }
305}
306
307bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
308 unsigned IID = N->getConstantOperandVal(Num: 1);
309 switch (IID) {
310 default:
311 return false;
312 case Intrinsic::nvvm_ldu_global_f:
313 case Intrinsic::nvvm_ldu_global_i:
314 case Intrinsic::nvvm_ldu_global_p:
315 return tryLDU(N);
316
317 case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
318 case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
319 case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
320 case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
321 case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
322 case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
323 case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
324 case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
338 case Intrinsic::nvvm_tcgen05_ld_32x32b_x1:
339 case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
340 case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
341 case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
342 case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
343 case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
344 case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
345 case Intrinsic::nvvm_tcgen05_ld_32x32b_x128: {
346 SelectTcgen05Ld(N);
347 return true;
348 }
349
350 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1:
351 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
352 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
353 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
354 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
355 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
356 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
357 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128: {
358 SelectTcgen05Ld(N, /* hasOffset */ true);
359 return true;
360 }
361 }
362}
363
// Map an ISD::CondCode value to the appropriate CmpMode expected by
// NVPTXInstPrinter::printCmpMode().
366static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ) {
367 using NVPTX::PTXCmpMode::CmpMode;
368 unsigned PTXCmpMode = [](ISD::CondCode CC) {
369 switch (CC) {
370 default:
371 llvm_unreachable("Unexpected condition code.");
372 case ISD::SETOEQ:
373 return CmpMode::EQ;
374 case ISD::SETOGT:
375 return CmpMode::GT;
376 case ISD::SETOGE:
377 return CmpMode::GE;
378 case ISD::SETOLT:
379 return CmpMode::LT;
380 case ISD::SETOLE:
381 return CmpMode::LE;
382 case ISD::SETONE:
383 return CmpMode::NE;
384 case ISD::SETO:
385 return CmpMode::NUM;
386 case ISD::SETUO:
387 return CmpMode::NotANumber;
388 case ISD::SETUEQ:
389 return CmpMode::EQU;
390 case ISD::SETUGT:
391 return CmpMode::GTU;
392 case ISD::SETUGE:
393 return CmpMode::GEU;
394 case ISD::SETULT:
395 return CmpMode::LTU;
396 case ISD::SETULE:
397 return CmpMode::LEU;
398 case ISD::SETUNE:
399 return CmpMode::NEU;
400 case ISD::SETEQ:
401 return CmpMode::EQ;
402 case ISD::SETGT:
403 return CmpMode::GT;
404 case ISD::SETGE:
405 return CmpMode::GE;
406 case ISD::SETLT:
407 return CmpMode::LT;
408 case ISD::SETLE:
409 return CmpMode::LE;
410 case ISD::SETNE:
411 return CmpMode::NE;
412 }
413 }(CondCode.get());
414
415 if (FTZ)
416 PTXCmpMode |= NVPTX::PTXCmpMode::FTZ_FLAG;
417
418 return PTXCmpMode;
419}
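
// Example (illustrative): for ISD::SETOGT with FTZ enabled, getPTXCmpMode()
// returns CmpMode::GT with FTZ_FLAG OR'ed in, which the instruction printer
// is expected to render as a "gt.ftz" comparison modifier on the emitted
// setp instruction.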
420
421bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
422 unsigned PTXCmpMode =
423 getPTXCmpMode(CondCode: *cast<CondCodeSDNode>(Val: N->getOperand(Num: 2)), FTZ: useF32FTZ());
424 SDLoc DL(N);
425 SDNode *SetP = CurDAG->getMachineNode(
426 Opcode: NVPTX::SETP_f16x2rr, dl: DL, VT1: MVT::i1, VT2: MVT::i1, Op1: N->getOperand(Num: 0),
427 Op2: N->getOperand(Num: 1), Op3: CurDAG->getTargetConstant(Val: PTXCmpMode, DL, VT: MVT::i32));
428 ReplaceNode(F: N, T: SetP);
429 return true;
430}
431
432bool NVPTXDAGToDAGISel::SelectSETP_BF16X2(SDNode *N) {
433 unsigned PTXCmpMode =
434 getPTXCmpMode(CondCode: *cast<CondCodeSDNode>(Val: N->getOperand(Num: 2)), FTZ: useF32FTZ());
435 SDLoc DL(N);
436 SDNode *SetP = CurDAG->getMachineNode(
437 Opcode: NVPTX::SETP_bf16x2rr, dl: DL, VT1: MVT::i1, VT2: MVT::i1, Op1: N->getOperand(Num: 0),
438 Op2: N->getOperand(Num: 1), Op3: CurDAG->getTargetConstant(Val: PTXCmpMode, DL, VT: MVT::i32));
439 ReplaceNode(F: N, T: SetP);
440 return true;
441}
442
443bool NVPTXDAGToDAGISel::tryUNPACK_VECTOR(SDNode *N) {
444 SDValue Vector = N->getOperand(Num: 0);
445 MVT EltVT = N->getSimpleValueType(ResNo: 0);
446
447 MachineSDNode *N2 =
448 CurDAG->getMachineNode(Opcode: NVPTX::I64toV2I32, dl: SDLoc(N), VT1: EltVT, VT2: EltVT, Ops: Vector);
449
450 ReplaceNode(F: N, T: N2);
451 return true;
452}
453
454// Find all instances of extract_vector_elt that use this v2f16 vector
455// and coalesce them into a scattering move instruction.
456bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
457 SDValue Vector = N->getOperand(Num: 0);
458
459 // We only care about 16x2 as it's the only real vector type we
460 // need to deal with.
461 MVT VT = Vector.getSimpleValueType();
462 if (!Isv2x16VT(VT))
463 return false;
464 // Find and record all uses of this vector that extract element 0 or 1.
465 SmallVector<SDNode *, 4> E0, E1;
466 for (auto *U : Vector.getNode()->users()) {
467 if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
468 continue;
469 if (U->getOperand(Num: 0) != Vector)
470 continue;
471 if (const ConstantSDNode *IdxConst =
472 dyn_cast<ConstantSDNode>(Val: U->getOperand(Num: 1))) {
473 if (IdxConst->getZExtValue() == 0)
474 E0.push_back(Elt: U);
475 else if (IdxConst->getZExtValue() == 1)
476 E1.push_back(Elt: U);
477 else
478 llvm_unreachable("Invalid vector index.");
479 }
480 }
481
482 // There's no point scattering f16x2 if we only ever access one
483 // element of it.
484 if (E0.empty() || E1.empty())
485 return false;
486
487 // Merge (f16 extractelt(V, 0), f16 extractelt(V,1))
488 // into f16,f16 SplitF16x2(V)
489 MVT EltVT = VT.getVectorElementType();
490 SDNode *ScatterOp =
491 CurDAG->getMachineNode(Opcode: NVPTX::I32toV2I16, dl: SDLoc(N), VT1: EltVT, VT2: EltVT, Ops: Vector);
492 for (auto *Node : E0)
493 ReplaceUses(F: SDValue(Node, 0), T: SDValue(ScatterOp, 0));
494 for (auto *Node : E1)
495 ReplaceUses(F: SDValue(Node, 0), T: SDValue(ScatterOp, 1));
496
497 return true;
498}
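
// Schematic example of the rewrite above (illustrative DAG notation):
//   v = ...:v2f16
//   a = extract_vector_elt v, 0
//   b = extract_vector_elt v, 1
// becomes a single machine node whose two results replace both extracts:
//   a, b = I32toV2I16 v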
499
500static std::optional<unsigned> convertAS(unsigned AS) {
501 switch (AS) {
502 case llvm::ADDRESS_SPACE_LOCAL:
503 return NVPTX::AddressSpace::Local;
504 case llvm::ADDRESS_SPACE_GLOBAL:
505 return NVPTX::AddressSpace::Global;
506 case llvm::ADDRESS_SPACE_SHARED:
507 return NVPTX::AddressSpace::Shared;
508 case llvm::ADDRESS_SPACE_SHARED_CLUSTER:
509 return NVPTX::AddressSpace::SharedCluster;
510 case llvm::ADDRESS_SPACE_GENERIC:
511 return NVPTX::AddressSpace::Generic;
512 case llvm::ADDRESS_SPACE_PARAM:
513 return NVPTX::AddressSpace::Param;
514 case llvm::ADDRESS_SPACE_CONST:
515 return NVPTX::AddressSpace::Const;
516 default:
517 return std::nullopt;
518 }
519}
520
521static unsigned int getCodeAddrSpace(const MemSDNode *N) {
522 return convertAS(AS: N->getMemOperand()->getAddrSpace())
523 .value_or(u: NVPTX::AddressSpace::Generic);
524}
525
526namespace {
527
528struct OperationOrderings {
529 NVPTX::Ordering InstructionOrdering, FenceOrdering;
530 OperationOrderings(NVPTX::Ordering IO = NVPTX::Ordering::NotAtomic,
531 NVPTX::Ordering FO = NVPTX::Ordering::NotAtomic)
532 : InstructionOrdering(IO), FenceOrdering(FO) {}
533};
534
535static OperationOrderings
536getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) {
537 AtomicOrdering Ordering = N->getSuccessOrdering();
538 auto CodeAddrSpace = getCodeAddrSpace(N);
539
540 bool HasMemoryOrdering = Subtarget->hasMemoryOrdering();
541 bool HasRelaxedMMIO = Subtarget->hasRelaxedMMIO();
542
543 // clang-format off
544
  // Lowering for Load/Store Operations
  // (note: an AcquireRelease load or store is an error).
546 // Note: uses of Relaxed in the Atomic column of this table refer
547 // to LLVM AtomicOrdering::Monotonic.
548 //
549 // | Atomic | Volatile | Statespace | PTX sm_60- | PTX sm_70+ |
550 // |---------|----------|--------------------|------------|------------------------------|
551 // | No | No | All | plain | .weak |
552 // | No | Yes | Generic,Shared, | .volatile | .volatile |
553 // | | | Global [0] | | |
554 // | No | Yes | Local,Const,Param | plain [1] | .weak [1] |
555 // | Unorder | Yes/No | All | == Relaxed | == Relaxed |
556 // | Relaxed | No | Generic,Shared, | .volatile | <atomic sem> |
557 // | | | Global [0] | | |
558 // | Other | No | Generic,Shared, | Error [2] | <atomic sem> |
559 // | | | Global [0] | | |
560 // | Yes | No | Local,Const,Param | plain [1] | .weak [1] |
561 // | Relaxed | Yes | Generic,Shared [0] | .volatile | .volatile |
562 // | Relaxed | Yes | Global [0] | .volatile | .mmio.relaxed.sys (PTX 8.2+) |
563 // | | | | | or .volatile (PTX 8.1-) |
564 // | Relaxed | Yes | Local,Const,Param | plain [1] | .weak [1] |
565 // | Other | Yes | Generic, Shared, | Error [2] | <atomic sem> [3] |
566 // | | | / Global [0] | | |
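  //
  // A few rows of the table above spelled out (illustrative only; the exact
  // PTX spelling comes from the instruction definitions):
  //   * non-atomic, non-volatile load, sm_70+              -> ld.weak / plain ld
  //   * relaxed atomic load, global, non-volatile, sm_70+  -> ld.relaxed.<scope>
  //   * relaxed atomic load, global, non-volatile, sm_60-  -> ld.volatile
  //   * relaxed atomic volatile load, global, PTX 8.2+     -> ld.mmio.relaxed.sys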
567
568 // Lowering of CUDA C++ SequentiallyConsistent Operations and Fences to PTX
569 // by following the ABI proven sound in:
570 // Lustig et al, A Formal Analysis of the NVIDIA PTX Memory Consistency Model, ASPLOS’19.
571 // https://dl.acm.org/doi/pdf/10.1145/3297858.3304043
572 //
573 // | CUDA C++ Atomic Operation or Atomic Fence | PTX Atomic Operation or Fence |
574 // |------------------------------------------------------|-------------------------------|
575 // | cuda::atomic_thread_fence | fence.sc.<scope>; |
576 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | |
577 // |------------------------------------------------------|-------------------------------|
578 // | cuda::atomic_load | fence.sc.<scope>; |
579 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | ld.acquire.<scope>; |
580 // |------------------------------------------------------|-------------------------------|
581 // | cuda::atomic_store | fence.sc.<scope>; |
582 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | st.release.<scope>; |
583 // |------------------------------------------------------|-------------------------------|
584 // | cuda::atomic_fetch_<op> | fence.sc.<scope>; |
585 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | atom.acq_rel.<scope>; |
586
587 // clang-format on
588
589 // [0]: volatile and atomics are only supported on global or shared
590 // memory locations, accessed via generic/shared/global pointers.
591 // MMIO is only supported on global memory locations,
592 // accessed via generic/global pointers.
593 // TODO: Implement MMIO access via generic pointer to global.
594 // Currently implemented for global pointers only.
595
596 // [1]: Lowering volatile/atomic operations to non-volatile/non-atomic
597 // PTX instructions fails to preserve their C++ side-effects.
598 //
599 // Example (https://github.com/llvm/llvm-project/issues/62057):
600 //
601 // void example() {
602 // std::atomic<bool> True = true;
603 // while (True.load(std::memory_order_relaxed));
604 // }
605 //
606 // A C++ program that calls "example" is well-defined: the infinite loop
607 // performs an atomic operation. By lowering volatile/atomics to
608 // "weak" memory operations, we are transforming the above into:
609 //
610 // void undefined_behavior() {
611 // bool True = true;
612 // while (True);
613 // }
614 //
615 // which exhibits undefined behavior in both C++ and PTX.
616 //
617 // Calling "example" in CUDA C++ compiled for sm_60- exhibits undefined
618 // behavior due to lack of Independent Forward Progress. Lowering these
619 // to weak memory operations in sm_60- is therefore fine.
620 //
621 // TODO: lower atomic and volatile operations to memory locations
622 // in local, const, and param to two PTX instructions in sm_70+:
623 // - the "weak" memory instruction we are currently lowering to, and
624 // - some other instruction that preserves the side-effect, e.g.,
625 // a dead dummy volatile load.
626 if (CodeAddrSpace == NVPTX::AddressSpace::Local ||
627 CodeAddrSpace == NVPTX::AddressSpace::Const ||
628 CodeAddrSpace == NVPTX::AddressSpace::Param) {
629 return NVPTX::Ordering::NotAtomic;
630 }
631
632 // [2]: Atomics with Ordering different than Unordered or Relaxed are not
633 // supported on sm_60 and older; this includes volatile atomics.
634 if (!(Ordering == AtomicOrdering::NotAtomic ||
635 Ordering == AtomicOrdering::Unordered ||
636 Ordering == AtomicOrdering::Monotonic) &&
637 !HasMemoryOrdering) {
    report_fatal_error(
        formatv("PTX does not support \"atomic\" for orderings different than "
                "\"NotAtomic\" or \"Monotonic\" for sm_60 or older, but order "
                "is: \"{}\".",
                toIRString(Ordering)));
643 }
644
645 // [3]: TODO: these should eventually use .mmio<.atomic sem>; for now we drop
646 // the volatile semantics and preserve the atomic ones.
647
648 // PTX volatile and PTX atomics are not available for statespace that differ
649 // from .generic, .global, or .shared. The behavior of PTX volatile and PTX
650 // atomics is undefined if the generic address does not refer to a .global or
651 // .shared memory location.
652 bool AddrGenericOrGlobalOrShared =
653 (CodeAddrSpace == NVPTX::AddressSpace::Generic ||
654 CodeAddrSpace == NVPTX::AddressSpace::Global ||
655 CodeAddrSpace == NVPTX::AddressSpace::Shared ||
656 CodeAddrSpace == NVPTX::AddressSpace::SharedCluster);
657 if (!AddrGenericOrGlobalOrShared)
658 return NVPTX::Ordering::NotAtomic;
659
660 bool UseRelaxedMMIO =
661 HasRelaxedMMIO && CodeAddrSpace == NVPTX::AddressSpace::Global;
662
663 switch (Ordering) {
664 case AtomicOrdering::NotAtomic:
665 return N->isVolatile() ? NVPTX::Ordering::Volatile
666 : NVPTX::Ordering::NotAtomic;
667 case AtomicOrdering::Unordered:
668 // We lower unordered in the exact same way as 'monotonic' to respect
669 // LLVM IR atomicity requirements.
670 case AtomicOrdering::Monotonic:
671 if (N->isVolatile())
672 return UseRelaxedMMIO ? NVPTX::Ordering::RelaxedMMIO
673 : NVPTX::Ordering::Volatile;
674 else
675 return HasMemoryOrdering ? NVPTX::Ordering::Relaxed
676 : NVPTX::Ordering::Volatile;
677 // case AtomicOrdering::Consume: // If LLVM ever provides this, lower it to
678 // Acquire.
679 case AtomicOrdering::Acquire:
680 if (!N->readMem())
681 report_fatal_error(
682 reason: formatv(Fmt: "PTX only supports Acquire Ordering on reads: {}",
683 Vals: N->getOperationName()));
684 return NVPTX::Ordering::Acquire;
685 case AtomicOrdering::Release:
686 if (!N->writeMem())
687 report_fatal_error(
688 reason: formatv(Fmt: "PTX only supports Release Ordering on writes: {}",
689 Vals: N->getOperationName()));
690 return NVPTX::Ordering::Release;
691 case AtomicOrdering::AcquireRelease: {
692 report_fatal_error(
693 reason: formatv(Fmt: "NVPTX does not support AcquireRelease Ordering on "
694 "read-modify-write "
695 "yet and PTX does not support it on loads or stores: {}",
696 Vals: N->getOperationName()));
697 }
698 case AtomicOrdering::SequentiallyConsistent: {
699 // LLVM-IR SequentiallyConsistent atomics map to a two-instruction PTX
700 // sequence including a "fence.sc.sco" and the memory instruction with an
701 // Ordering that differs from "sc": acq, rel, or acq_rel, depending on
702 // whether the memory operation is a read, write, or read-modify-write.
703 //
704 // This sets the ordering of the fence to SequentiallyConsistent, and
705 // sets the corresponding ordering for the instruction.
706 NVPTX::Ordering InstrOrder;
707 if (N->readMem())
708 InstrOrder = NVPTX::Ordering::Acquire;
709 else if (N->writeMem())
710 InstrOrder = NVPTX::Ordering::Release;
711 else
712 report_fatal_error(
713 reason: formatv(Fmt: "NVPTX does not support SequentiallyConsistent Ordering on "
714 "read-modify-writes yet: {}",
715 Vals: N->getOperationName()));
716 return OperationOrderings(InstrOrder,
717 NVPTX::Ordering::SequentiallyConsistent);
718 }
719 }
720 report_fatal_error(
721 reason: formatv(Fmt: "NVPTX backend does not support AtomicOrdering \"{}\" yet.",
722 Vals: toIRString(ao: Ordering)));
723}
724
725} // namespace
726
727NVPTX::Scope NVPTXDAGToDAGISel::getOperationScope(MemSDNode *N,
728 NVPTX::Ordering O) const {
729 switch (O) {
730 case NVPTX::Ordering::NotAtomic:
731 case NVPTX::Ordering::Volatile: // Non-atomic volatile operations
732 // NVPTX uses Thread scope as the scope of non-atomic operations.
733 return NVPTX::Scope::Thread;
734 case NVPTX::Ordering::RelaxedMMIO:
735 // RelaxedMMIO operations are always system scope.
736 // If a RelaxedMMIO order was generated from an atomic volatile operation
737 // with a smaller thread scope, we bump it here to system scope.
738 return NVPTX::Scope::System;
739 case NVPTX::Ordering::Relaxed:
740 case NVPTX::Ordering::Acquire:
741 case NVPTX::Ordering::Release:
742 case NVPTX::Ordering::AcquireRelease:
743 case NVPTX::Ordering::SequentiallyConsistent:
744 auto S = Scopes[N->getSyncScopeID()];
745
746 // Atomic operations must have a scope greater than thread.
747 if (S == NVPTX::Scope::Thread)
748 report_fatal_error(
749 reason: formatv(Fmt: "Atomics need scope > \"{}\".", Vals: ScopeToString(S)));
750
751 // If scope is cluster, clusters must be supported.
752 if (S == NVPTX::Scope::Cluster)
753 Subtarget->failIfClustersUnsupported(FailureMessage: "cluster scope");
754
755 // If operation is volatile, then its scope is system.
756 return N->isVolatile() ? NVPTX::Scope::System : S;
757 }
758 llvm_unreachable("unhandled ordering");
759}
760
761static bool canLowerToLDG(const MemSDNode &N, const NVPTXSubtarget &Subtarget,
762 unsigned CodeAddrSpace) {
763 // We use ldg (i.e. ld.global.nc) for invariant loads from the global address
764 // space.
765 return Subtarget.hasLDG() && CodeAddrSpace == NVPTX::AddressSpace::Global &&
766 N.isInvariant();
767}
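
// Illustrative IR that takes this path (assumed, minimal):
//   %v = load i32, ptr addrspace(1) %p, !invariant.load !0
// On subtargets with LDG support this selects the non-coherent form, roughly
// "ld.global.nc.b32", instead of a plain "ld.global.b32".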
768
769static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S,
770 NVPTXSubtarget const *T) {
771 if (S == NVPTX::Scope::Cluster)
772 T->failIfClustersUnsupported(FailureMessage: ".cluster scope fence");
773
774 // Fall back to .acq_rel if .acquire, .release is not supported.
775 if (!T->hasSplitAcquireAndReleaseFences() &&
776 (O == NVPTX::Ordering::Acquire || O == NVPTX::Ordering::Release))
777 O = NVPTX::Ordering::AcquireRelease;
778
779 switch (O) {
780 case NVPTX::Ordering::Acquire:
781 switch (S) {
782 case NVPTX::Scope::System:
783 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_sys
784 : NVPTX::INT_MEMBAR_SYS;
785 case NVPTX::Scope::Block:
786 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_cta
787 : NVPTX::INT_MEMBAR_CTA;
788 case NVPTX::Scope::Cluster:
789 return NVPTX::atomic_thread_fence_acquire_cluster;
790 case NVPTX::Scope::Device:
791 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_gpu
792 : NVPTX::INT_MEMBAR_GL;
793 case NVPTX::Scope::Thread:
794 report_fatal_error(
795 reason: formatv(Fmt: "Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
796 Vals: ScopeToString(S)));
797 }
798 break;
799 case NVPTX::Ordering::Release:
800 switch (S) {
801 case NVPTX::Scope::System:
802 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_sys
803 : NVPTX::INT_MEMBAR_SYS;
804 case NVPTX::Scope::Block:
805 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_cta
806 : NVPTX::INT_MEMBAR_CTA;
807 case NVPTX::Scope::Cluster:
808 return NVPTX::atomic_thread_fence_release_cluster;
809 case NVPTX::Scope::Device:
810 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_gpu
811 : NVPTX::INT_MEMBAR_GL;
812 case NVPTX::Scope::Thread:
813 report_fatal_error(
814 reason: formatv(Fmt: "Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
815 Vals: ScopeToString(S)));
816 }
817 break;
818 case NVPTX::Ordering::AcquireRelease: {
819 switch (S) {
820 case NVPTX::Scope::System:
821 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_sys
822 : NVPTX::INT_MEMBAR_SYS;
823 case NVPTX::Scope::Block:
824 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_cta
825 : NVPTX::INT_MEMBAR_CTA;
826 case NVPTX::Scope::Cluster:
827 return NVPTX::atomic_thread_fence_acq_rel_cluster;
828 case NVPTX::Scope::Device:
829 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_gpu
830 : NVPTX::INT_MEMBAR_GL;
831 case NVPTX::Scope::Thread:
832 report_fatal_error(
833 reason: formatv(Fmt: "Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
834 Vals: ScopeToString(S)));
835 }
836 break;
837 }
838 case NVPTX::Ordering::SequentiallyConsistent: {
839 switch (S) {
840 case NVPTX::Scope::System:
841 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_sys
842 : NVPTX::INT_MEMBAR_SYS;
843 case NVPTX::Scope::Block:
844 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_cta
845 : NVPTX::INT_MEMBAR_CTA;
846 case NVPTX::Scope::Cluster:
847 return NVPTX::atomic_thread_fence_seq_cst_cluster;
848 case NVPTX::Scope::Device:
849 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_gpu
850 : NVPTX::INT_MEMBAR_GL;
851 case NVPTX::Scope::Thread:
852 report_fatal_error(reason: formatv(Fmt: "Unsupported scope \"{}\" for seq_cst fence.",
853 Vals: ScopeToString(S)));
854 }
855 break;
856 }
857 case NVPTX::Ordering::NotAtomic:
858 case NVPTX::Ordering::Relaxed:
859 case NVPTX::Ordering::Volatile:
860 case NVPTX::Ordering::RelaxedMMIO:
861 report_fatal_error(
862 reason: formatv(Fmt: "Unsupported \"{}\" ordering and \"{}\" scope for fence.",
863 Vals: OrderingToString(Order: O), Vals: ScopeToString(S)));
864 }
865 llvm_unreachable("unhandled ordering");
866}
867
868// Returns Memory Order and Scope of a memory instruction, and
869// inserts any fence before the instruction that's required to
870// implement its memory ordering.
871std::pair<NVPTX::Ordering, NVPTX::Scope>
872NVPTXDAGToDAGISel::insertMemoryInstructionFence(SDLoc DL, SDValue &Chain,
873 MemSDNode *N) {
874 auto [InstructionOrdering, FenceOrdering] =
875 getOperationOrderings(N, Subtarget);
876 auto Scope = getOperationScope(N, O: InstructionOrdering);
877
878 // If a fence is required before the operation, insert it:
879 switch (NVPTX::Ordering(FenceOrdering)) {
880 case NVPTX::Ordering::NotAtomic:
881 break;
882 case NVPTX::Ordering::SequentiallyConsistent: {
883 auto Op = getFenceOp(O: FenceOrdering, S: Scope, T: Subtarget);
884 Chain = SDValue(CurDAG->getMachineNode(Opcode: Op, dl: DL, VT: MVT::Other, Op1: Chain), 0);
885 break;
886 }
887 default:
888 report_fatal_error(
889 reason: formatv(Fmt: "Unexpected fence ordering: \"{}\".",
890 Vals: OrderingToString(Order: NVPTX::Ordering(FenceOrdering))));
891 }
892 return {InstructionOrdering, Scope};
893}
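
// Example (illustrative): an LLVM IR "load atomic ... seq_cst" on sm_70+ is
// expected to lower to a two-instruction PTX sequence, roughly
//   fence.sc.sys;           // emitted here (FenceOrdering = SequentiallyConsistent)
//   ld.acquire.sys.b32 ...; // the load itself (InstructionOrdering = Acquire)
// with the scope taken from the operation's sync scope.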
894
895void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
896 SDValue Src = N->getOperand(Num: 0);
897 AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(Val: N);
898 unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
899 unsigned DstAddrSpace = CastN->getDestAddressSpace();
900 SDLoc DL(N);
901 assert(SrcAddrSpace != DstAddrSpace &&
902 "addrspacecast must be between different address spaces");
903
904 if (DstAddrSpace == ADDRESS_SPACE_GENERIC) {
905 // Specific to generic
906
907 if (TM.is64Bit() && TM.getPointerSizeInBits(AS: SrcAddrSpace) == 32) {
908 SDValue CvtNone =
909 CurDAG->getTargetConstant(Val: NVPTX::PTXCvtMode::NONE, DL, VT: MVT::i32);
910 SDNode *Cvt = CurDAG->getMachineNode(Opcode: NVPTX::CVT_u64_u32, dl: DL, VT: MVT::i64,
911 Op1: Src, Op2: CvtNone);
912 Src = SDValue(Cvt, 0);
913 }
914
915 unsigned Opc;
916 switch (SrcAddrSpace) {
917 default: report_fatal_error(reason: "Bad address space in addrspacecast");
918 case ADDRESS_SPACE_GLOBAL:
919 Opc = TM.is64Bit() ? NVPTX::cvta_global_64 : NVPTX::cvta_global;
920 break;
921 case ADDRESS_SPACE_SHARED:
922 Opc = TM.is64Bit() ? NVPTX::cvta_shared_64 : NVPTX::cvta_shared;
923 break;
924 case ADDRESS_SPACE_SHARED_CLUSTER:
925 if (!TM.is64Bit())
926 report_fatal_error(
927 reason: "Shared cluster address space is only supported in 64-bit mode");
928 Opc = NVPTX::cvta_shared_cluster_64;
929 break;
930 case ADDRESS_SPACE_CONST:
931 Opc = TM.is64Bit() ? NVPTX::cvta_const_64 : NVPTX::cvta_const;
932 break;
933 case ADDRESS_SPACE_LOCAL:
934 Opc = TM.is64Bit() ? NVPTX::cvta_local_64 : NVPTX::cvta_local;
935 break;
936 case ADDRESS_SPACE_PARAM:
937 Opc = TM.is64Bit() ? NVPTX::cvta_param_64 : NVPTX::cvta_param;
938 break;
939 }
940 ReplaceNode(F: N, T: CurDAG->getMachineNode(Opcode: Opc, dl: DL, VT: N->getValueType(ResNo: 0), Op1: Src));
941 return;
942 } else {
943 // Generic to specific
944 if (SrcAddrSpace != 0)
945 report_fatal_error(reason: "Cannot cast between two non-generic address spaces");
946 unsigned Opc;
947 switch (DstAddrSpace) {
948 default: report_fatal_error(reason: "Bad address space in addrspacecast");
949 case ADDRESS_SPACE_GLOBAL:
950 Opc = TM.is64Bit() ? NVPTX::cvta_to_global_64 : NVPTX::cvta_to_global;
951 break;
952 case ADDRESS_SPACE_SHARED:
953 Opc = TM.is64Bit() ? NVPTX::cvta_to_shared_64 : NVPTX::cvta_to_shared;
954 break;
955 case ADDRESS_SPACE_SHARED_CLUSTER:
956 if (!TM.is64Bit())
957 report_fatal_error(
958 reason: "Shared cluster address space is only supported in 64-bit mode");
959 Opc = NVPTX::cvta_to_shared_cluster_64;
960 break;
961 case ADDRESS_SPACE_CONST:
962 Opc = TM.is64Bit() ? NVPTX::cvta_to_const_64 : NVPTX::cvta_to_const;
963 break;
964 case ADDRESS_SPACE_LOCAL:
965 Opc = TM.is64Bit() ? NVPTX::cvta_to_local_64 : NVPTX::cvta_to_local;
966 break;
967 case ADDRESS_SPACE_PARAM:
968 Opc = TM.is64Bit() ? NVPTX::cvta_to_param_64 : NVPTX::cvta_to_param;
969 break;
970 }
971
972 SDNode *CVTA = CurDAG->getMachineNode(Opcode: Opc, dl: DL, VT: N->getValueType(ResNo: 0), Op1: Src);
973 if (TM.is64Bit() && TM.getPointerSizeInBits(AS: DstAddrSpace) == 32) {
974 SDValue CvtNone =
975 CurDAG->getTargetConstant(Val: NVPTX::PTXCvtMode::NONE, DL, VT: MVT::i32);
976 CVTA = CurDAG->getMachineNode(Opcode: NVPTX::CVT_u32_u64, dl: DL, VT: MVT::i32,
977 Op1: SDValue(CVTA, 0), Op2: CvtNone);
978 }
979
980 ReplaceNode(F: N, T: CVTA);
981 return;
982 }
983}
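
// Illustrative selections (64-bit mode; PTX spelling is approximate):
//   addrspacecast ptr addrspace(1) -> ptr   selects cvta_global_64,
//     i.e. roughly "cvta.global.u64".
//   addrspacecast ptr -> ptr addrspace(3)   selects cvta_to_shared_64,
//     i.e. roughly "cvta.to.shared.u64".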
984
// Helper function to reduce the amount of boilerplate code needed for opcode
// selection.
987static std::optional<unsigned>
988pickOpcodeForVT(MVT::SimpleValueType VT, std::optional<unsigned> Opcode_i8,
989 std::optional<unsigned> Opcode_i16,
990 std::optional<unsigned> Opcode_i32,
991 std::optional<unsigned> Opcode_i64) {
992 switch (VT) {
993 case MVT::i1:
994 case MVT::i8:
995 return Opcode_i8;
996 case MVT::f16:
997 case MVT::i16:
998 case MVT::bf16:
999 return Opcode_i16;
1000 case MVT::v2f16:
1001 case MVT::v2bf16:
1002 case MVT::v2i16:
1003 case MVT::v4i8:
1004 case MVT::i32:
1005 case MVT::f32:
1006 return Opcode_i32;
1007 case MVT::i64:
1008 case MVT::f64:
1009 return Opcode_i64;
1010 default:
1011 return std::nullopt;
1012 }
1013}
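
// Note that the packed 16-bit vectors (v2f16, v2bf16, v2i16) and v4i8 share
// the 32-bit opcode on purpose: they are moved as a single 32-bit value. For
// example, pickOpcodeForVT(MVT::v2f16, LD_i8, LD_i16, LD_i32, LD_i64) yields
// the LD_i32 opcode.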
1014
1015bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
1016 MemSDNode *LD = cast<MemSDNode>(Val: N);
1017 assert(LD->readMem() && "Expected load");
1018
1019 // do not support pre/post inc/dec
1020 const LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(Val: LD);
1021 if (PlainLoad && PlainLoad->isIndexed())
1022 return false;
1023
1024 const EVT LoadedEVT = LD->getMemoryVT();
1025 if (!LoadedEVT.isSimple())
1026 return false;
1027 const MVT LoadedVT = LoadedEVT.getSimpleVT();
1028
1029 // Address Space Setting
1030 const unsigned CodeAddrSpace = getCodeAddrSpace(N: LD);
1031 if (canLowerToLDG(N: *LD, Subtarget: *Subtarget, CodeAddrSpace))
1032 return tryLDG(N: LD);
1033
1034 SDLoc DL(LD);
1035 SDValue Chain = N->getOperand(Num: 0);
1036 const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, N: LD);
1037
1038 const unsigned FromTypeWidth = LoadedVT.getSizeInBits();
1039
1040 // Vector Setting
1041 const unsigned FromType =
1042 (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
1043 ? NVPTX::PTXLdStInstCode::Signed
1044 : NVPTX::PTXLdStInstCode::Untyped;
1045
1046 assert(isPowerOf2_32(FromTypeWidth) && FromTypeWidth >= 8 &&
1047 FromTypeWidth <= 128 && "Invalid width for load");
1048
1049 // Create the machine instruction DAG
1050 SDValue Offset, Base;
1051 SelectADDR(Addr: N->getOperand(Num: 1), Base, Offset);
1052 SDValue Ops[] = {getI32Imm(Imm: Ordering, DL),
1053 getI32Imm(Imm: Scope, DL),
1054 getI32Imm(Imm: CodeAddrSpace, DL),
1055 getI32Imm(Imm: FromType, DL),
1056 getI32Imm(Imm: FromTypeWidth, DL),
1057 Base,
1058 Offset,
1059 Chain};
1060
1061 const MVT::SimpleValueType TargetVT = LD->getSimpleValueType(ResNo: 0).SimpleTy;
1062 const std::optional<unsigned> Opcode = pickOpcodeForVT(
1063 VT: TargetVT, Opcode_i8: NVPTX::LD_i8, Opcode_i16: NVPTX::LD_i16, Opcode_i32: NVPTX::LD_i32, Opcode_i64: NVPTX::LD_i64);
1064 if (!Opcode)
1065 return false;
1066
1067 SDNode *NVPTXLD = CurDAG->getMachineNode(Opcode: *Opcode, dl: DL, VTs: LD->getVTList(), Ops);
1068 if (!NVPTXLD)
1069 return false;
1070
1071 MachineMemOperand *MemRef = LD->getMemOperand();
1072 CurDAG->setNodeMemRefs(N: cast<MachineSDNode>(Val: NVPTXLD), NewMemRefs: {MemRef});
1073
1074 ReplaceNode(F: LD, T: NVPTXLD);
1075 return true;
1076}
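
// For example, a simple non-atomic i32 load from the generic address space is
// expected to select to NVPTX::LD_i32 with operands
//   {NotAtomic, Thread, Generic, Untyped, /*width*/32, Base, Offset, Chain},
// matching the order of Ops[] built above.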
1077
1078static unsigned getLoadStoreVectorNumElts(SDNode *N) {
1079 switch (N->getOpcode()) {
1080 case NVPTXISD::LoadV2:
1081 case NVPTXISD::StoreV2:
1082 return 2;
1083 case NVPTXISD::LoadV4:
1084 case NVPTXISD::StoreV4:
1085 return 4;
1086 case NVPTXISD::LoadV8:
1087 case NVPTXISD::StoreV8:
1088 return 8;
1089 default:
1090 llvm_unreachable("Unexpected opcode");
1091 }
1092}
1093
1094bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
1095 MemSDNode *LD = cast<MemSDNode>(Val: N);
1096 const EVT MemEVT = LD->getMemoryVT();
1097 if (!MemEVT.isSimple())
1098 return false;
1099 const MVT MemVT = MemEVT.getSimpleVT();
1100
1101 // Address Space Setting
1102 const unsigned CodeAddrSpace = getCodeAddrSpace(N: LD);
1103 if (canLowerToLDG(N: *LD, Subtarget: *Subtarget, CodeAddrSpace))
1104 return tryLDG(N: LD);
1105
1106 const MVT EltVT = LD->getSimpleValueType(ResNo: 0);
1107 SDLoc DL(LD);
1108 SDValue Chain = LD->getChain();
1109 const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, N: LD);
1110
  // Type Setting: fromType + fromTypeWidth
  //
  // Sign     : ISD::SEXTLOAD
  // Unsigned : ISD::ZEXTLOAD, ISD::NON_EXTLOAD, or ISD::EXTLOAD when the
  //            type is an integer
  // Float    : ISD::NON_EXTLOAD or ISD::EXTLOAD when the type is float
  // Read at least 8 bits (predicates are stored as 8-bit values).
  // The last operand holds the original LoadSDNode::getExtensionType() value.
1119 const unsigned TotalWidth = MemVT.getSizeInBits();
1120 const unsigned ExtensionType =
1121 N->getConstantOperandVal(Num: N->getNumOperands() - 1);
1122 const unsigned FromType = (ExtensionType == ISD::SEXTLOAD)
1123 ? NVPTX::PTXLdStInstCode::Signed
1124 : NVPTX::PTXLdStInstCode::Untyped;
1125
1126 const unsigned FromTypeWidth = TotalWidth / getLoadStoreVectorNumElts(N);
1127
1128 assert(!(EltVT.isVector() && ExtensionType != ISD::NON_EXTLOAD));
1129 assert(isPowerOf2_32(FromTypeWidth) && FromTypeWidth >= 8 &&
1130 FromTypeWidth <= 128 && TotalWidth <= 256 && "Invalid width for load");
1131
1132 SDValue Offset, Base;
1133 SelectADDR(Addr: N->getOperand(Num: 1), Base, Offset);
1134 SDValue Ops[] = {getI32Imm(Imm: Ordering, DL),
1135 getI32Imm(Imm: Scope, DL),
1136 getI32Imm(Imm: CodeAddrSpace, DL),
1137 getI32Imm(Imm: FromType, DL),
1138 getI32Imm(Imm: FromTypeWidth, DL),
1139 Base,
1140 Offset,
1141 Chain};
1142
1143 std::optional<unsigned> Opcode;
1144 switch (N->getOpcode()) {
1145 default:
1146 llvm_unreachable("Unexpected opcode");
1147 case NVPTXISD::LoadV2:
1148 Opcode =
1149 pickOpcodeForVT(VT: EltVT.SimpleTy, Opcode_i8: NVPTX::LDV_i8_v2, Opcode_i16: NVPTX::LDV_i16_v2,
1150 Opcode_i32: NVPTX::LDV_i32_v2, Opcode_i64: NVPTX::LDV_i64_v2);
1151 break;
1152 case NVPTXISD::LoadV4:
1153 Opcode =
1154 pickOpcodeForVT(VT: EltVT.SimpleTy, Opcode_i8: NVPTX::LDV_i8_v4, Opcode_i16: NVPTX::LDV_i16_v4,
1155 Opcode_i32: NVPTX::LDV_i32_v4, Opcode_i64: NVPTX::LDV_i64_v4);
1156 break;
1157 case NVPTXISD::LoadV8:
1158 Opcode = pickOpcodeForVT(VT: EltVT.SimpleTy, Opcode_i8: {/* no v8i8 */}, Opcode_i16: {/* no v8i16 */},
1159 Opcode_i32: NVPTX::LDV_i32_v8, Opcode_i64: {/* no v8i64 */});
1160 break;
1161 }
1162 if (!Opcode)
1163 return false;
1164
1165 SDNode *NVPTXLD = CurDAG->getMachineNode(Opcode: *Opcode, dl: DL, VTs: LD->getVTList(), Ops);
1166
1167 MachineMemOperand *MemRef = LD->getMemOperand();
1168 CurDAG->setNodeMemRefs(N: cast<MachineSDNode>(Val: NVPTXLD), NewMemRefs: {MemRef});
1169
1170 ReplaceNode(F: LD, T: NVPTXLD);
1171 return true;
1172}
1173
1174bool NVPTXDAGToDAGISel::tryLDG(MemSDNode *LD) {
1175 const EVT LoadedEVT = LD->getMemoryVT();
1176 if (!LoadedEVT.isSimple())
1177 return false;
1178 const MVT LoadedVT = LoadedEVT.getSimpleVT();
1179
1180 SDLoc DL(LD);
1181
1182 const unsigned TotalWidth = LoadedVT.getSizeInBits();
1183 unsigned ExtensionType;
1184 unsigned NumElts;
1185 if (const auto *Load = dyn_cast<LoadSDNode>(Val: LD)) {
1186 ExtensionType = Load->getExtensionType();
1187 NumElts = 1;
1188 } else {
1189 ExtensionType = LD->getConstantOperandVal(Num: LD->getNumOperands() - 1);
1190 NumElts = getLoadStoreVectorNumElts(N: LD);
1191 }
1192 const unsigned FromType = (ExtensionType == ISD::SEXTLOAD)
1193 ? NVPTX::PTXLdStInstCode::Signed
1194 : NVPTX::PTXLdStInstCode::Untyped;
1195
1196 const unsigned FromTypeWidth = TotalWidth / NumElts;
1197
1198 assert(!(LD->getSimpleValueType(0).isVector() &&
1199 ExtensionType != ISD::NON_EXTLOAD));
1200 assert(isPowerOf2_32(FromTypeWidth) && FromTypeWidth >= 8 &&
1201 FromTypeWidth <= 128 && TotalWidth <= 256 && "Invalid width for load");
1202
1203 SDValue Base, Offset;
1204 SelectADDR(Addr: LD->getOperand(Num: 1), Base, Offset);
1205 SDValue Ops[] = {getI32Imm(Imm: FromType, DL), getI32Imm(Imm: FromTypeWidth, DL), Base,
1206 Offset, LD->getChain()};
1207
1208 const MVT::SimpleValueType TargetVT = LD->getSimpleValueType(ResNo: 0).SimpleTy;
1209 std::optional<unsigned> Opcode;
1210 switch (LD->getOpcode()) {
1211 default:
1212 llvm_unreachable("Unexpected opcode");
1213 case ISD::LOAD:
1214 Opcode = pickOpcodeForVT(VT: TargetVT, Opcode_i8: NVPTX::LD_GLOBAL_NC_i8,
1215 Opcode_i16: NVPTX::LD_GLOBAL_NC_i16, Opcode_i32: NVPTX::LD_GLOBAL_NC_i32,
1216 Opcode_i64: NVPTX::LD_GLOBAL_NC_i64);
1217 break;
1218 case NVPTXISD::LoadV2:
1219 Opcode = pickOpcodeForVT(
1220 VT: TargetVT, Opcode_i8: NVPTX::LD_GLOBAL_NC_v2i8, Opcode_i16: NVPTX::LD_GLOBAL_NC_v2i16,
1221 Opcode_i32: NVPTX::LD_GLOBAL_NC_v2i32, Opcode_i64: NVPTX::LD_GLOBAL_NC_v2i64);
1222 break;
1223 case NVPTXISD::LoadV4:
1224 Opcode = pickOpcodeForVT(
1225 VT: TargetVT, Opcode_i8: NVPTX::LD_GLOBAL_NC_v4i8, Opcode_i16: NVPTX::LD_GLOBAL_NC_v4i16,
1226 Opcode_i32: NVPTX::LD_GLOBAL_NC_v4i32, Opcode_i64: NVPTX::LD_GLOBAL_NC_v4i64);
1227 break;
1228 case NVPTXISD::LoadV8:
1229 Opcode = pickOpcodeForVT(VT: TargetVT, Opcode_i8: {/* no v8i8 */}, Opcode_i16: {/* no v8i16 */},
1230 Opcode_i32: NVPTX::LD_GLOBAL_NC_v8i32, Opcode_i64: {/* no v8i64 */});
1231 break;
1232 }
1233 if (!Opcode)
1234 return false;
1235
1236 SDNode *NVPTXLDG = CurDAG->getMachineNode(Opcode: *Opcode, dl: DL, VTs: LD->getVTList(), Ops);
1237
1238 ReplaceNode(F: LD, T: NVPTXLDG);
1239 return true;
1240}
1241
1242bool NVPTXDAGToDAGISel::tryLDU(SDNode *N) {
1243 auto *LD = cast<MemSDNode>(Val: N);
1244
1245 unsigned NumElts;
1246 switch (N->getOpcode()) {
1247 default:
1248 llvm_unreachable("Unexpected opcode");
1249 case ISD::INTRINSIC_W_CHAIN:
1250 NumElts = 1;
1251 break;
1252 case NVPTXISD::LDUV2:
1253 NumElts = 2;
1254 break;
1255 case NVPTXISD::LDUV4:
1256 NumElts = 4;
1257 break;
1258 }
1259
1260 const MVT::SimpleValueType SelectVT =
1261 MVT::getIntegerVT(BitWidth: LD->getMemoryVT().getSizeInBits() / NumElts).SimpleTy;
1262
  // If this is an LDU intrinsic, the address is the third operand. If it's an
  // LDU SD node (from custom vector handling), then it's the second operand.
1265 SDValue Addr =
1266 LD->getOperand(Num: LD->getOpcode() == ISD::INTRINSIC_W_CHAIN ? 2 : 1);
1267
1268 SDValue Base, Offset;
1269 SelectADDR(Addr, Base, Offset);
1270 SDValue Ops[] = {Base, Offset, LD->getChain()};
1271
1272 std::optional<unsigned> Opcode;
1273 switch (N->getOpcode()) {
1274 default:
1275 llvm_unreachable("Unexpected opcode");
1276 case ISD::INTRINSIC_W_CHAIN:
1277 Opcode =
1278 pickOpcodeForVT(VT: SelectVT, Opcode_i8: NVPTX::LDU_GLOBAL_i8, Opcode_i16: NVPTX::LDU_GLOBAL_i16,
1279 Opcode_i32: NVPTX::LDU_GLOBAL_i32, Opcode_i64: NVPTX::LDU_GLOBAL_i64);
1280 break;
1281 case NVPTXISD::LDUV2:
1282 Opcode = pickOpcodeForVT(VT: SelectVT, Opcode_i8: NVPTX::LDU_GLOBAL_v2i8,
1283 Opcode_i16: NVPTX::LDU_GLOBAL_v2i16, Opcode_i32: NVPTX::LDU_GLOBAL_v2i32,
1284 Opcode_i64: NVPTX::LDU_GLOBAL_v2i64);
1285 break;
1286 case NVPTXISD::LDUV4:
1287 Opcode = pickOpcodeForVT(VT: SelectVT, Opcode_i8: NVPTX::LDU_GLOBAL_v4i8,
1288 Opcode_i16: NVPTX::LDU_GLOBAL_v4i16, Opcode_i32: NVPTX::LDU_GLOBAL_v4i32,
1289 Opcode_i64: {/* no v4i64 */});
1290 break;
1291 }
1292 if (!Opcode)
1293 return false;
1294
1295 SDLoc DL(N);
1296 SDNode *NVPTXLDU = CurDAG->getMachineNode(Opcode: *Opcode, dl: DL, VTs: LD->getVTList(), Ops);
1297
1298 ReplaceNode(F: LD, T: NVPTXLDU);
1299 return true;
1300}
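
// The LDU path selects the "ldu.global" family (e.g. roughly "ldu.global.b32"
// for a 32-bit element), which PTX defines for loads whose address is the
// same across all threads in the warp.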
1301
1302bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
1303 MemSDNode *ST = cast<MemSDNode>(Val: N);
1304 assert(ST->writeMem() && "Expected store");
1305 StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(Val: ST);
1306 AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(Val: ST);
1307 assert((PlainStore || AtomicStore) && "Expected store");
1308
1309 // do not support pre/post inc/dec
1310 if (PlainStore && PlainStore->isIndexed())
1311 return false;
1312
1313 const EVT StoreVT = ST->getMemoryVT();
1314 if (!StoreVT.isSimple())
1315 return false;
1316
1317 // Address Space Setting
1318 const unsigned CodeAddrSpace = getCodeAddrSpace(N: ST);
1319
1320 SDLoc DL(ST);
1321 SDValue Chain = ST->getChain();
1322 const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, N: ST);
1323
1324 // Vector Setting
1325 const unsigned ToTypeWidth = StoreVT.getSimpleVT().getSizeInBits();
1326
1327 // Create the machine instruction DAG
1328 SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
1329
1330 assert(isPowerOf2_32(ToTypeWidth) && ToTypeWidth >= 8 && ToTypeWidth <= 128 &&
1331 "Invalid width for store");
1332
1333 SDValue Offset, Base;
1334 SelectADDR(Addr: ST->getBasePtr(), Base, Offset);
1335
1336 SDValue Ops[] = {selectPossiblyImm(V: Value),
1337 getI32Imm(Imm: Ordering, DL),
1338 getI32Imm(Imm: Scope, DL),
1339 getI32Imm(Imm: CodeAddrSpace, DL),
1340 getI32Imm(Imm: ToTypeWidth, DL),
1341 Base,
1342 Offset,
1343 Chain};
1344
1345 const std::optional<unsigned> Opcode =
1346 pickOpcodeForVT(VT: Value.getSimpleValueType().SimpleTy, Opcode_i8: NVPTX::ST_i8,
1347 Opcode_i16: NVPTX::ST_i16, Opcode_i32: NVPTX::ST_i32, Opcode_i64: NVPTX::ST_i64);
1348 if (!Opcode)
1349 return false;
1350
1351 SDNode *NVPTXST = CurDAG->getMachineNode(Opcode: *Opcode, dl: DL, VT: MVT::Other, Ops);
1352
1353 if (!NVPTXST)
1354 return false;
1355
1356 MachineMemOperand *MemRef = ST->getMemOperand();
1357 CurDAG->setNodeMemRefs(N: cast<MachineSDNode>(Val: NVPTXST), NewMemRefs: {MemRef});
1358 ReplaceNode(F: ST, T: NVPTXST);
1359 return true;
1360}
1361
1362bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
1363 MemSDNode *ST = cast<MemSDNode>(Val: N);
1364 const EVT StoreVT = ST->getMemoryVT();
1365 assert(StoreVT.isSimple() && "Store value is not simple");
1366
1367 // Address Space Setting
1368 const unsigned CodeAddrSpace = getCodeAddrSpace(N: ST);
1369 if (CodeAddrSpace == NVPTX::AddressSpace::Const) {
1370 report_fatal_error(reason: "Cannot store to pointer that points to constant "
1371 "memory space");
1372 }
1373
1374 SDLoc DL(ST);
1375 SDValue Chain = ST->getChain();
1376 const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, N: ST);
1377
1378 // Type Setting: toType + toTypeWidth
1379 // - for integer type, always use 'u'
1380 const unsigned TotalWidth = StoreVT.getSimpleVT().getSizeInBits();
1381
1382 const unsigned NumElts = getLoadStoreVectorNumElts(N: ST);
1383
1384 SmallVector<SDValue, 16> Ops;
1385 for (auto &V : ST->ops().slice(N: 1, M: NumElts))
1386 Ops.push_back(Elt: selectPossiblyImm(V));
1387 SDValue Addr = N->getOperand(Num: NumElts + 1);
1388 const unsigned ToTypeWidth = TotalWidth / NumElts;
1389
1390 assert(isPowerOf2_32(ToTypeWidth) && ToTypeWidth >= 8 && ToTypeWidth <= 128 &&
1391 TotalWidth <= 256 && "Invalid width for store");
1392
1393 SDValue Offset, Base;
1394 SelectADDR(Addr, Base, Offset);
1395
1396 Ops.append(IL: {getI32Imm(Imm: Ordering, DL), getI32Imm(Imm: Scope, DL),
1397 getI32Imm(Imm: CodeAddrSpace, DL), getI32Imm(Imm: ToTypeWidth, DL), Base,
1398 Offset, Chain});
1399
1400 const MVT::SimpleValueType EltVT =
1401 ST->getOperand(Num: 1).getSimpleValueType().SimpleTy;
1402 std::optional<unsigned> Opcode;
1403 switch (ST->getOpcode()) {
1404 default:
1405 return false;
1406 case NVPTXISD::StoreV2:
1407 Opcode = pickOpcodeForVT(VT: EltVT, Opcode_i8: NVPTX::STV_i8_v2, Opcode_i16: NVPTX::STV_i16_v2,
1408 Opcode_i32: NVPTX::STV_i32_v2, Opcode_i64: NVPTX::STV_i64_v2);
1409 break;
1410 case NVPTXISD::StoreV4:
1411 Opcode = pickOpcodeForVT(VT: EltVT, Opcode_i8: NVPTX::STV_i8_v4, Opcode_i16: NVPTX::STV_i16_v4,
1412 Opcode_i32: NVPTX::STV_i32_v4, Opcode_i64: NVPTX::STV_i64_v4);
1413 break;
1414 case NVPTXISD::StoreV8:
1415 Opcode = pickOpcodeForVT(VT: EltVT, Opcode_i8: {/* no v8i8 */}, Opcode_i16: {/* no v8i16 */},
1416 Opcode_i32: NVPTX::STV_i32_v8, Opcode_i64: {/* no v8i64 */});
1417 break;
1418 }
1419
1420 if (!Opcode)
1421 return false;
1422
1423 SDNode *NVPTXST = CurDAG->getMachineNode(Opcode: *Opcode, dl: DL, VT: MVT::Other, Ops);
1424
1425 MachineMemOperand *MemRef = ST->getMemOperand();
1426 CurDAG->setNodeMemRefs(N: cast<MachineSDNode>(Val: NVPTXST), NewMemRefs: {MemRef});
1427
1428 ReplaceNode(F: ST, T: NVPTXST);
1429 return true;
1430}
1431
1432bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
1433 SDValue Chain = Node->getOperand(Num: 0);
1434 SDValue Offset = Node->getOperand(Num: 2);
1435 SDValue Glue = Node->getOperand(Num: 3);
1436 SDLoc DL(Node);
1437 MemSDNode *Mem = cast<MemSDNode>(Val: Node);
1438
1439 unsigned VecSize;
1440 switch (Node->getOpcode()) {
1441 default:
1442 return false;
1443 case NVPTXISD::LoadParam:
1444 VecSize = 1;
1445 break;
1446 case NVPTXISD::LoadParamV2:
1447 VecSize = 2;
1448 break;
1449 case NVPTXISD::LoadParamV4:
1450 VecSize = 4;
1451 break;
1452 }
1453
1454 EVT EltVT = Node->getValueType(ResNo: 0);
1455 EVT MemVT = Mem->getMemoryVT();
1456
1457 std::optional<unsigned> Opcode;
1458
1459 switch (VecSize) {
1460 default:
1461 return false;
1462 case 1:
1463 Opcode = pickOpcodeForVT(VT: MemVT.getSimpleVT().SimpleTy,
1464 Opcode_i8: NVPTX::LoadParamMemI8, Opcode_i16: NVPTX::LoadParamMemI16,
1465 Opcode_i32: NVPTX::LoadParamMemI32, Opcode_i64: NVPTX::LoadParamMemI64);
1466 break;
1467 case 2:
1468 Opcode =
1469 pickOpcodeForVT(VT: MemVT.getSimpleVT().SimpleTy, Opcode_i8: NVPTX::LoadParamMemV2I8,
1470 Opcode_i16: NVPTX::LoadParamMemV2I16, Opcode_i32: NVPTX::LoadParamMemV2I32,
1471 Opcode_i64: NVPTX::LoadParamMemV2I64);
1472 break;
1473 case 4:
1474 Opcode = pickOpcodeForVT(VT: MemVT.getSimpleVT().SimpleTy,
1475 Opcode_i8: NVPTX::LoadParamMemV4I8, Opcode_i16: NVPTX::LoadParamMemV4I16,
1476 Opcode_i32: NVPTX::LoadParamMemV4I32, Opcode_i64: {/* no v4i64 */});
1477 break;
1478 }
1479 if (!Opcode)
1480 return false;
1481
1482 SDVTList VTs;
1483 if (VecSize == 1) {
1484 VTs = CurDAG->getVTList(VT1: EltVT, VT2: MVT::Other, VT3: MVT::Glue);
1485 } else if (VecSize == 2) {
1486 VTs = CurDAG->getVTList(VT1: EltVT, VT2: EltVT, VT3: MVT::Other, VT4: MVT::Glue);
1487 } else {
1488 EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue };
1489 VTs = CurDAG->getVTList(VTs: EVTs);
1490 }
1491
1492 unsigned OffsetVal = Offset->getAsZExtVal();
1493
1494 SmallVector<SDValue, 2> Ops(
1495 {CurDAG->getTargetConstant(Val: OffsetVal, DL, VT: MVT::i32), Chain, Glue});
1496
1497 ReplaceNode(F: Node, T: CurDAG->getMachineNode(Opcode: *Opcode, dl: DL, VTs, Ops));
1498 return true;
1499}
1500
1501// Helpers for constructing opcode (ex: NVPTX::StoreParamV4F32_iiri)
1502#define getOpcV2H(ty, opKind0, opKind1) \
1503 NVPTX::StoreParamV2##ty##_##opKind0##opKind1
1504
1505#define getOpcV2H1(ty, opKind0, isImm1) \
1506 (isImm1) ? getOpcV2H(ty, opKind0, i) : getOpcV2H(ty, opKind0, r)
1507
1508#define getOpcodeForVectorStParamV2(ty, isimm) \
1509 (isimm[0]) ? getOpcV2H1(ty, i, isimm[1]) : getOpcV2H1(ty, r, isimm[1])
1510
1511#define getOpcV4H(ty, opKind0, opKind1, opKind2, opKind3) \
1512 NVPTX::StoreParamV4##ty##_##opKind0##opKind1##opKind2##opKind3
1513
1514#define getOpcV4H3(ty, opKind0, opKind1, opKind2, isImm3) \
1515 (isImm3) ? getOpcV4H(ty, opKind0, opKind1, opKind2, i) \
1516 : getOpcV4H(ty, opKind0, opKind1, opKind2, r)
1517
1518#define getOpcV4H2(ty, opKind0, opKind1, isImm2, isImm3) \
1519 (isImm2) ? getOpcV4H3(ty, opKind0, opKind1, i, isImm3) \
1520 : getOpcV4H3(ty, opKind0, opKind1, r, isImm3)
1521
1522#define getOpcV4H1(ty, opKind0, isImm1, isImm2, isImm3) \
1523 (isImm1) ? getOpcV4H2(ty, opKind0, i, isImm2, isImm3) \
1524 : getOpcV4H2(ty, opKind0, r, isImm2, isImm3)
1525
1526#define getOpcodeForVectorStParamV4(ty, isimm) \
1527 (isimm[0]) ? getOpcV4H1(ty, i, isimm[1], isimm[2], isimm[3]) \
1528 : getOpcV4H1(ty, r, isimm[1], isimm[2], isimm[3])
1529
1530#define getOpcodeForVectorStParam(n, ty, isimm) \
1531 (n == 2) ? getOpcodeForVectorStParamV2(ty, isimm) \
1532 : getOpcodeForVectorStParamV4(ty, isimm)
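
// Illustrative expansion of the helpers above: with ty = I32 and
// isimm = {true, false, false, true}, getOpcodeForVectorStParamV4 expands to
// NVPTX::StoreParamV4I32_irri, i.e. each 'i'/'r' suffix position records
// whether the corresponding source operand is an immediate or a register.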
1533
1534static unsigned pickOpcodeForVectorStParam(SmallVector<SDValue, 8> &Ops,
1535 unsigned NumElts,
1536 MVT::SimpleValueType MemTy,
1537 SelectionDAG *CurDAG, SDLoc DL) {
  // Determine which inputs are registers or immediates, and replace the
  // immediate inputs with equivalent target-constant operands.
1540 SmallVector<bool, 4> IsImm(NumElts, false);
1541 for (unsigned i = 0; i < NumElts; i++) {
1542 IsImm[i] = (isa<ConstantSDNode>(Val: Ops[i]) || isa<ConstantFPSDNode>(Val: Ops[i]));
1543 if (IsImm[i]) {
1544 SDValue Imm = Ops[i];
1545 if (MemTy == MVT::f32 || MemTy == MVT::f64) {
1546 const ConstantFPSDNode *ConstImm = cast<ConstantFPSDNode>(Val&: Imm);
1547 const ConstantFP *CF = ConstImm->getConstantFPValue();
1548 Imm = CurDAG->getTargetConstantFP(Val: *CF, DL, VT: Imm->getValueType(ResNo: 0));
1549 } else {
1550 const ConstantSDNode *ConstImm = cast<ConstantSDNode>(Val&: Imm);
1551 const ConstantInt *CI = ConstImm->getConstantIntValue();
1552 Imm = CurDAG->getTargetConstant(Val: *CI, DL, VT: Imm->getValueType(ResNo: 0));
1553 }
1554 Ops[i] = Imm;
1555 }
1556 }
1557
1558 // Get opcode for MemTy, size, and register/immediate operand ordering
1559 switch (MemTy) {
1560 case MVT::i8:
1561 return getOpcodeForVectorStParam(NumElts, I8, IsImm);
1562 case MVT::i16:
1563 return getOpcodeForVectorStParam(NumElts, I16, IsImm);
1564 case MVT::i32:
1565 return getOpcodeForVectorStParam(NumElts, I32, IsImm);
1566 case MVT::i64:
1567 assert(NumElts == 2 && "MVT too large for NumElts > 2");
1568 return getOpcodeForVectorStParamV2(I64, IsImm);
1569 case MVT::f32:
1570 return getOpcodeForVectorStParam(NumElts, F32, IsImm);
1571 case MVT::f64:
1572 assert(NumElts == 2 && "MVT too large for NumElts > 2");
1573 return getOpcodeForVectorStParamV2(F64, IsImm);
1574
1575  // These cases don't support immediates; just use the all-register version
1576  // and generate moves.
1577 case MVT::i1:
1578 return (NumElts == 2) ? NVPTX::StoreParamV2I8_rr
1579 : NVPTX::StoreParamV4I8_rrrr;
1580 case MVT::f16:
1581 case MVT::bf16:
1582 return (NumElts == 2) ? NVPTX::StoreParamV2I16_rr
1583 : NVPTX::StoreParamV4I16_rrrr;
1584 case MVT::v2f16:
1585 case MVT::v2bf16:
1586 case MVT::v2i16:
1587 case MVT::v4i8:
1588 return (NumElts == 2) ? NVPTX::StoreParamV2I32_rr
1589 : NVPTX::StoreParamV4I32_rrrr;
1590 default:
1591 llvm_unreachable("Cannot select st.param for unknown MemTy");
1592 }
1593}
1594
1595bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
1596 SDLoc DL(N);
1597 SDValue Chain = N->getOperand(Num: 0);
1598 SDValue Param = N->getOperand(Num: 1);
1599 unsigned ParamVal = Param->getAsZExtVal();
1600 SDValue Offset = N->getOperand(Num: 2);
1601 unsigned OffsetVal = Offset->getAsZExtVal();
1602 MemSDNode *Mem = cast<MemSDNode>(Val: N);
1603 SDValue Glue = N->getOperand(Num: N->getNumOperands() - 1);
1604
1605 // How many elements do we have?
1606 unsigned NumElts;
1607 switch (N->getOpcode()) {
1608 default:
1609 llvm_unreachable("Unexpected opcode");
1610 case NVPTXISD::StoreParam:
1611 NumElts = 1;
1612 break;
1613 case NVPTXISD::StoreParamV2:
1614 NumElts = 2;
1615 break;
1616 case NVPTXISD::StoreParamV4:
1617 NumElts = 4;
1618 break;
1619 }
1620
1621 // Build vector of operands
1622 SmallVector<SDValue, 8> Ops;
1623 for (unsigned i = 0; i < NumElts; ++i)
1624 Ops.push_back(Elt: N->getOperand(Num: i + 3));
1625 Ops.append(IL: {CurDAG->getTargetConstant(Val: ParamVal, DL, VT: MVT::i32),
1626 CurDAG->getTargetConstant(Val: OffsetVal, DL, VT: MVT::i32), Chain, Glue});
1627
1628 // Determine target opcode
1629 // If we have an i1, use an 8-bit store. The lowering code in
1630 // NVPTXISelLowering will have already emitted an upcast.
1631 std::optional<unsigned> Opcode;
1632 switch (NumElts) {
1633 default:
1634 llvm_unreachable("Unexpected NumElts");
1635 case 1: {
1636 MVT::SimpleValueType MemTy = Mem->getMemoryVT().getSimpleVT().SimpleTy;
1637 SDValue Imm = Ops[0];
1638 if (MemTy != MVT::f16 && MemTy != MVT::bf16 &&
1639 (isa<ConstantSDNode>(Val: Imm) || isa<ConstantFPSDNode>(Val: Imm))) {
1640 // Convert immediate to target constant
1641 if (MemTy == MVT::f32 || MemTy == MVT::f64) {
1642 const ConstantFPSDNode *ConstImm = cast<ConstantFPSDNode>(Val&: Imm);
1643 const ConstantFP *CF = ConstImm->getConstantFPValue();
1644 Imm = CurDAG->getTargetConstantFP(Val: *CF, DL, VT: Imm->getValueType(ResNo: 0));
1645 } else {
1646 const ConstantSDNode *ConstImm = cast<ConstantSDNode>(Val&: Imm);
1647 const ConstantInt *CI = ConstImm->getConstantIntValue();
1648 Imm = CurDAG->getTargetConstant(Val: *CI, DL, VT: Imm->getValueType(ResNo: 0));
1649 }
1650 Ops[0] = Imm;
1651 // Use immediate version of store param
1652 Opcode =
1653 pickOpcodeForVT(VT: MemTy, Opcode_i8: NVPTX::StoreParamI8_i, Opcode_i16: NVPTX::StoreParamI16_i,
1654 Opcode_i32: NVPTX::StoreParamI32_i, Opcode_i64: NVPTX::StoreParamI64_i);
1655 } else
1656 Opcode = pickOpcodeForVT(VT: Mem->getMemoryVT().getSimpleVT().SimpleTy,
1657 Opcode_i8: NVPTX::StoreParamI8_r, Opcode_i16: NVPTX::StoreParamI16_r,
1658 Opcode_i32: NVPTX::StoreParamI32_r, Opcode_i64: NVPTX::StoreParamI64_r);
1659 if (Opcode == NVPTX::StoreParamI8_r) {
1660      // Fine-tune the opcode depending on the size of the operand.
1661 // This helps to avoid creating redundant COPY instructions in
1662 // InstrEmitter::AddRegisterOperand().
1663 switch (Ops[0].getSimpleValueType().SimpleTy) {
1664 default:
1665 break;
1666 case MVT::i32:
1667 Opcode = NVPTX::StoreParamI8TruncI32_r;
1668 break;
1669 case MVT::i64:
1670 Opcode = NVPTX::StoreParamI8TruncI64_r;
1671 break;
1672 }
1673 }
1674 break;
1675 }
1676 case 2:
1677 case 4: {
1678 MVT::SimpleValueType MemTy = Mem->getMemoryVT().getSimpleVT().SimpleTy;
1679 Opcode = pickOpcodeForVectorStParam(Ops, NumElts, MemTy, CurDAG, DL);
1680 break;
1681 }
1682 }
1683
1684 SDVTList RetVTs = CurDAG->getVTList(VT1: MVT::Other, VT2: MVT::Glue);
1685 SDNode *Ret = CurDAG->getMachineNode(Opcode: *Opcode, dl: DL, VTs: RetVTs, Ops);
1686 MachineMemOperand *MemRef = cast<MemSDNode>(Val: N)->getMemOperand();
1687 CurDAG->setNodeMemRefs(N: cast<MachineSDNode>(Val: Ret), NewMemRefs: {MemRef});
1688
1689 ReplaceNode(F: N, T: Ret);
1690 return true;
1691}
1692
1693/// tryBFE - Look for instruction sequences that can be made more efficient
1694/// by using the 'bfe' (bit-field extract) PTX instruction.
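/// For illustration, (and (srl %x:i32, 8), 255) extracts the 8-bit field that
/// starts at bit 8, so it is selected as BFE_U32rii %x, 8, 8, i.e. the PTX
/// "bfe.u32" form with start 8 and length 8.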
1695bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
1696 SDLoc DL(N);
1697 SDValue LHS = N->getOperand(Num: 0);
1698 SDValue RHS = N->getOperand(Num: 1);
1699 SDValue Len;
1700 SDValue Start;
1701 SDValue Val;
1702 bool IsSigned = false;
1703
1704 if (N->getOpcode() == ISD::AND) {
1705 // Canonicalize the operands
1706 // We want 'and %val, %mask'
1707 if (isa<ConstantSDNode>(Val: LHS) && !isa<ConstantSDNode>(Val: RHS)) {
1708 std::swap(a&: LHS, b&: RHS);
1709 }
1710
1711 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Val&: RHS);
1712 if (!Mask) {
1713 // We need a constant mask on the RHS of the AND
1714 return false;
1715 }
1716
1717 // Extract the mask bits
1718 uint64_t MaskVal = Mask->getZExtValue();
1719 if (!isMask_64(Value: MaskVal)) {
1720      // We *could* handle shifted masks here, but doing so would require an
1721      // 'and' operation to fix up the low-order bits, so we would trade
1722      // shr+and for bfe+and, which has the same throughput.
1723 return false;
1724 }
1725
1726 // How many bits are in our mask?
1727 int64_t NumBits = countr_one(Value: MaskVal);
1728 Len = CurDAG->getTargetConstant(Val: NumBits, DL, VT: MVT::i32);
1729
1730 if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {
1731 // We have a 'srl/and' pair, extract the effective start bit and length
1732 Val = LHS.getNode()->getOperand(Num: 0);
1733 Start = LHS.getNode()->getOperand(Num: 1);
1734 ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Val&: Start);
1735 if (StartConst) {
1736 uint64_t StartVal = StartConst->getZExtValue();
1737 // How many "good" bits do we have left? "good" is defined here as bits
1738 // that exist in the original value, not shifted in.
1739 int64_t GoodBits = Start.getValueSizeInBits() - StartVal;
1740 if (NumBits > GoodBits) {
1741 // Do not handle the case where bits have been shifted in. In theory
1742 // we could handle this, but the cost is likely higher than just
1743 // emitting the srl/and pair.
1744 return false;
1745 }
1746 Start = CurDAG->getTargetConstant(Val: StartVal, DL, VT: MVT::i32);
1747 } else {
1748        // Do not handle the case where the shift amount (which can be zero
1749        // if no srl was found) is not constant. We could handle this case,
1750        // but it would require run-time logic that would be more expensive
1751        // than just emitting the srl/and pair.
1752 return false;
1753 }
1754 } else {
1755      // Do not handle the case where the LHS of the and is not a shift. While
1756      // it would be trivial to handle this case, it would just transform
1757      // 'and' -> 'bfe', but 'and' has higher throughput.
1758 return false;
1759 }
1760 } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
1761 if (LHS->getOpcode() == ISD::AND) {
1762 ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(Val&: RHS);
1763 if (!ShiftCnst) {
1764 // Shift amount must be constant
1765 return false;
1766 }
1767
1768 uint64_t ShiftAmt = ShiftCnst->getZExtValue();
1769
1770 SDValue AndLHS = LHS->getOperand(Num: 0);
1771 SDValue AndRHS = LHS->getOperand(Num: 1);
1772
1773 // Canonicalize the AND to have the mask on the RHS
1774 if (isa<ConstantSDNode>(Val: AndLHS)) {
1775 std::swap(a&: AndLHS, b&: AndRHS);
1776 }
1777
1778 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Val&: AndRHS);
1779 if (!MaskCnst) {
1780 // Mask must be constant
1781 return false;
1782 }
1783
1784 uint64_t MaskVal = MaskCnst->getZExtValue();
1785 uint64_t NumZeros;
1786 uint64_t NumBits;
1787 if (isMask_64(Value: MaskVal)) {
1788 NumZeros = 0;
1789 // The number of bits in the result bitfield will be the number of
1790 // trailing ones (the AND) minus the number of bits we shift off
1791 NumBits = llvm::countr_one(Value: MaskVal) - ShiftAmt;
1792 } else if (isShiftedMask_64(Value: MaskVal)) {
1793 NumZeros = llvm::countr_zero(Val: MaskVal);
1794 unsigned NumOnes = llvm::countr_one(Value: MaskVal >> NumZeros);
1795 // The number of bits in the result bitfield will be the number of
1796 // trailing zeros plus the number of set bits in the mask minus the
1797 // number of bits we shift off
1798 NumBits = NumZeros + NumOnes - ShiftAmt;
1799 } else {
1800 // This is not a mask we can handle
1801 return false;
1802 }
1803
1804 if (ShiftAmt < NumZeros) {
1805 // Handling this case would require extra logic that would make this
1806 // transformation non-profitable
1807 return false;
1808 }
1809
1810 Val = AndLHS;
1811 Start = CurDAG->getTargetConstant(Val: ShiftAmt, DL, VT: MVT::i32);
1812 Len = CurDAG->getTargetConstant(Val: NumBits, DL, VT: MVT::i32);
1813
1814      // If the pre-shift AND includes the sign bit in the bitfield, we must
1815      // use a signed BFE to replicate that bit during bitfield extraction.
1816      // If the sign bit is not part of the mask, an unsigned BFE will zero
1817      // out the upper bits of the result.
1818 if (N->getOpcode() == ISD::SRA)
1819 IsSigned = (ShiftAmt + NumBits) == Val.getValueSizeInBits();
1820 } else if (LHS->getOpcode() == ISD::SHL) {
1821 // Here, we have a pattern like:
1822 //
1823 // (sra (shl val, NN), MM)
1824 // or
1825 // (srl (shl val, NN), MM)
1826 //
1827 // If MM >= NN, we can efficiently optimize this with bfe
1828 Val = LHS->getOperand(Num: 0);
1829
1830 SDValue ShlRHS = LHS->getOperand(Num: 1);
1831 ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(Val&: ShlRHS);
1832 if (!ShlCnst) {
1833 // Shift amount must be constant
1834 return false;
1835 }
1836 uint64_t InnerShiftAmt = ShlCnst->getZExtValue();
1837
1838 SDValue ShrRHS = RHS;
1839 ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(Val&: ShrRHS);
1840 if (!ShrCnst) {
1841 // Shift amount must be constant
1842 return false;
1843 }
1844 uint64_t OuterShiftAmt = ShrCnst->getZExtValue();
1845
1846 // To avoid extra codegen and be profitable, we need Outer >= Inner
1847 if (OuterShiftAmt < InnerShiftAmt) {
1848 return false;
1849 }
1850
1851      // If the outer shift is more than the type size, we have no bitfield to
1852      // extract (and since we also check that the inner shift is <= the outer
1853      // shift, this also implies that the inner shift is < the type size).
1854 if (OuterShiftAmt >= Val.getValueSizeInBits()) {
1855 return false;
1856 }
1857
1858 Start = CurDAG->getTargetConstant(Val: OuterShiftAmt - InnerShiftAmt, DL,
1859 VT: MVT::i32);
1860 Len = CurDAG->getTargetConstant(Val: Val.getValueSizeInBits() - OuterShiftAmt,
1861 DL, VT: MVT::i32);
1862
1863 if (N->getOpcode() == ISD::SRA) {
1864        // If we have an arithmetic right shift, we need to use the signed bfe
1865        // variant.
1866 IsSigned = true;
1867 }
1868 } else {
1869 // No can do...
1870 return false;
1871 }
1872 } else {
1873 // No can do...
1874 return false;
1875 }
1876
1877
1878 unsigned Opc;
1879  // For the BFE operations we form here from "and" and plain "srl", we always
1880  // use the unsigned variants; IsSigned is only set for the "sra" patterns.
1881 if (Val.getValueType() == MVT::i32) {
1882 if (IsSigned) {
1883 Opc = NVPTX::BFE_S32rii;
1884 } else {
1885 Opc = NVPTX::BFE_U32rii;
1886 }
1887 } else if (Val.getValueType() == MVT::i64) {
1888 if (IsSigned) {
1889 Opc = NVPTX::BFE_S64rii;
1890 } else {
1891 Opc = NVPTX::BFE_U64rii;
1892 }
1893 } else {
1894 // We cannot handle this type
1895 return false;
1896 }
1897
1898 SDValue Ops[] = {
1899 Val, Start, Len
1900 };
1901
1902 ReplaceNode(F: N, T: CurDAG->getMachineNode(Opcode: Opc, dl: DL, VTs: N->getVTList(), Ops));
1903 return true;
1904}
1905
1906// Select bf16/v2bf16 FADD, FSUB, FMUL as fma on targets that only provide fma
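// For example, on a target without native bf16 multiply support,
// (fmul bf16 %a, %b) is selected below as BFMA16rrr %a, %b, %c, where %c is
// a -0.0 constant materialized with BFMOV16i.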
1907bool NVPTXDAGToDAGISel::tryBF16ArithToFMA(SDNode *N) {
1908 EVT VT = SDValue(N, 0).getValueType();
1909 if (VT.getScalarType() != MVT::bf16)
1910 return false;
1911
1912 const NVPTXSubtarget *STI = TM.getSubtargetImpl();
1913 if (STI->hasNativeBF16Support(Opcode: N->getOpcode()))
1914 return false;
1915
1916 const bool IsVec = VT.isVector();
1917 assert(!IsVec || VT.getVectorNumElements() == 2);
1918 SDLoc DL(N);
1919 SDValue N0 = N->getOperand(Num: 0);
1920 SDValue N1 = N->getOperand(Num: 1);
1921 SmallVector<SDValue, 3> Operands;
1922 auto GetConstant = [&](float Value) -> SDValue {
1923 // BF16 immediates must be legalized to integer register values
1924 APFloat APF(Value);
1925 bool LosesInfo;
1926 APF.convert(ToSemantics: APFloat::BFloat(), RM: APFloat::rmNearestTiesToEven, losesInfo: &LosesInfo);
1927 assert(!LosesInfo);
1928 if (IsVec) {
1929 auto API = APF.bitcastToAPInt();
1930 API = API.concat(NewLSB: API);
1931 auto Const = CurDAG->getTargetConstant(Val: API, DL, VT: MVT::i32);
1932 return SDValue(CurDAG->getMachineNode(Opcode: NVPTX::IMOV32i, dl: DL, VT, Op1: Const), 0);
1933 }
1934 auto Const = CurDAG->getTargetConstantFP(Val: APF, DL, VT);
1935 return SDValue(CurDAG->getMachineNode(Opcode: NVPTX::BFMOV16i, dl: DL, VT, Op1: Const), 0);
1936 };
1937
1938 switch (N->getOpcode()) {
1939 case ISD::FADD:
1940 // add(a, b) -> fma(a, 1.0, b)
1941 Operands = {N0, GetConstant(1.0), N1};
1942 break;
1943 case ISD::FSUB:
1944 // sub(a, b) -> fma(b, -1.0, a)
1945 Operands = {N1, GetConstant(-1.0), N0};
1946 break;
1947 case ISD::FMUL:
1948 // mul(a, b) -> fma(a, b, -0.0)
1949    // NOTE: The identity is -0, not 0, since a +0 addend would turn -0 into +0
1950 Operands = {N0, N1, GetConstant(-0.0)};
1951 break;
1952 default:
1953 llvm_unreachable("Unexpected opcode");
1954  }
1955
1956 int Opcode = IsVec ? NVPTX::BFMA16x2rrr : NVPTX::BFMA16rrr;
1957 MachineSDNode *FMA = CurDAG->getMachineNode(Opcode, dl: DL, VT, Ops: Operands);
1958 ReplaceNode(F: N, T: FMA);
1959 return true;
1960}
1961
1962static inline bool isAddLike(const SDValue V) {
1963 return V.getOpcode() == ISD::ADD ||
1964 (V->getOpcode() == ISD::OR && V->getFlags().hasDisjoint());
1965}
1966
1967// selectBaseADDR - Match a dag node which will serve as the base address for an
1968// ADDR operand pair.
1969static SDValue selectBaseADDR(SDValue N, SelectionDAG *DAG) {
1970 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(Val&: N))
1971 return DAG->getTargetGlobalAddress(GV: GA->getGlobal(), DL: SDLoc(N),
1972 VT: GA->getValueType(ResNo: 0), offset: GA->getOffset(),
1973 TargetFlags: GA->getTargetFlags());
1974 if (const auto *ES = dyn_cast<ExternalSymbolSDNode>(Val&: N))
1975 return DAG->getTargetExternalSymbol(Sym: ES->getSymbol(), VT: ES->getValueType(ResNo: 0),
1976 TargetFlags: ES->getTargetFlags());
1977 if (const auto *FIN = dyn_cast<FrameIndexSDNode>(Val&: N))
1978 return DAG->getTargetFrameIndex(FI: FIN->getIndex(), VT: FIN->getValueType(ResNo: 0));
1979
1980 return N;
1981}
1982
1983static SDValue accumulateOffset(SDValue &Addr, SDLoc DL, SelectionDAG *DAG) {
1984 APInt AccumulatedOffset(64u, 0);
1985 while (isAddLike(V: Addr)) {
1986 const auto *CN = dyn_cast<ConstantSDNode>(Val: Addr.getOperand(i: 1));
1987 if (!CN)
1988 break;
1989
1990 const APInt CI = CN->getAPIntValue().sext(width: 64);
1991 if (!(CI + AccumulatedOffset).isSignedIntN(N: 32))
1992 break;
1993
1994 AccumulatedOffset += CI;
1995 Addr = Addr->getOperand(Num: 0);
1996 }
1997 return DAG->getSignedTargetConstant(Val: AccumulatedOffset.getSExtValue(), DL,
1998 VT: MVT::i32);
1999}
2000
2001// Select a pair of operands which represent a valid PTX address. This could be
2002// one of the following:
2003// - [var] - Offset is simply set to 0
2004// - [reg] - Offset is simply set to 0
2005// - [reg+immOff]
2006// - [var+immOff]
2007// Note that immOff must fit into a 32-bit signed integer.
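// For illustration, given Addr = (add (add %base, 4), 8), accumulateOffset
// folds both constants, so SelectADDR returns Base = %base and Offset = 12,
// i.e. the PTX addressing form [%base+12].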
2008bool NVPTXDAGToDAGISel::SelectADDR(SDValue Addr, SDValue &Base,
2009 SDValue &Offset) {
2010 Offset = accumulateOffset(Addr, DL: SDLoc(Addr), DAG: CurDAG);
2011 Base = selectBaseADDR(N: Addr, DAG: CurDAG);
2012 return true;
2013}
2014
2015SDValue NVPTXDAGToDAGISel::selectPossiblyImm(SDValue V) {
2016 if (V.getOpcode() == ISD::BITCAST)
2017 V = V.getOperand(i: 0);
2018
2019 if (auto *CN = dyn_cast<ConstantSDNode>(Val&: V))
2020 return CurDAG->getTargetConstant(Val: CN->getAPIntValue(), DL: SDLoc(V),
2021 VT: V.getValueType());
2022 if (auto *CN = dyn_cast<ConstantFPSDNode>(Val&: V))
2023 return CurDAG->getTargetConstantFP(Val: CN->getValueAPF(), DL: SDLoc(V),
2024 VT: V.getValueType());
2025 return V;
2026}
2027
2028bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
2029 unsigned int spN) const {
2030 const Value *Src = nullptr;
2031 if (MemSDNode *mN = dyn_cast<MemSDNode>(Val: N)) {
2032 if (spN == 0 && mN->getMemOperand()->getPseudoValue())
2033 return true;
2034 Src = mN->getMemOperand()->getValue();
2035 }
2036 if (!Src)
2037 return false;
2038 if (auto *PT = dyn_cast<PointerType>(Val: Src->getType()))
2039 return (PT->getAddressSpace() == spN);
2040 return false;
2041}
2042
2043/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
2044/// inline asm expressions.
2045bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
2046 const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
2047 std::vector<SDValue> &OutOps) {
2048 SDValue Op0, Op1;
2049 switch (ConstraintID) {
2050 default:
2051 return true;
2052 case InlineAsm::ConstraintCode::m: // memory
2053 if (SelectADDR(Addr: Op, Base&: Op0, Offset&: Op1)) {
2054 OutOps.push_back(x: Op0);
2055 OutOps.push_back(x: Op1);
2056 return false;
2057 }
2058 break;
2059 }
2060 return true;
2061}
2062
2063void NVPTXDAGToDAGISel::SelectV2I64toI128(SDNode *N) {
2064 // Lower a CopyToReg with two 64-bit inputs
2065 // Dst:i128, lo:i64, hi:i64
2066 //
2067 // CopyToReg Dst, lo, hi;
2068 //
2069 // ==>
2070 //
2071 // tmp = V2I64toI128 {lo, hi};
2072 // CopyToReg Dst, tmp;
2073 SDValue Dst = N->getOperand(Num: 1);
2074 SDValue Lo = N->getOperand(Num: 2);
2075 SDValue Hi = N->getOperand(Num: 3);
2076
2077 SDLoc DL(N);
2078 SDNode *Mov =
2079 CurDAG->getMachineNode(Opcode: NVPTX::V2I64toI128, dl: DL, VT: MVT::i128, Ops: {Lo, Hi});
2080
2081 SmallVector<SDValue, 4> NewOps(N->getNumOperands() - 1);
2082 NewOps[0] = N->getOperand(Num: 0);
2083 NewOps[1] = Dst;
2084 NewOps[2] = SDValue(Mov, 0);
2085 if (N->getNumOperands() == 5)
2086 NewOps[3] = N->getOperand(Num: 4);
2087 SDValue NewValue = CurDAG->getNode(Opcode: ISD::CopyToReg, DL, ResultTys: SmallVector<EVT>(N->values()), Ops: NewOps);
2088
2089 ReplaceNode(F: N, T: NewValue.getNode());
2090}
2091
2092void NVPTXDAGToDAGISel::SelectI128toV2I64(SDNode *N) {
2093  // Lower CopyFromReg from a 128-bit reg to two 64-bit regs
2094 // Dst:i128, Src:i128
2095 //
2096 // {lo, hi} = CopyFromReg Src
2097 //
2098 // ==>
2099 //
2100 // {lo, hi} = I128toV2I64 Src
2101 //
2102 SDValue Ch = N->getOperand(Num: 0);
2103 SDValue Src = N->getOperand(Num: 1);
2104 SDValue Glue = N->getOperand(Num: 2);
2105 SDLoc DL(N);
2106
2107  // Add Glue and Ch to the operands and results to avoid breaking the
2108  // execution order.
2109 SDNode *Mov = CurDAG->getMachineNode(
2110 Opcode: NVPTX::I128toV2I64, dl: DL,
2111 ResultTys: {MVT::i64, MVT::i64, Ch.getValueType(), Glue.getValueType()},
2112 Ops: {Src, Ch, Glue});
2113
2114 ReplaceNode(F: N, T: Mov);
2115}
2116
2117bool NVPTXDAGToDAGISel::tryFence(SDNode *N) {
2118 SDLoc DL(N);
2119 assert(N->getOpcode() == ISD::ATOMIC_FENCE);
2120 unsigned int FenceOp =
2121 getFenceOp(O: NVPTX::Ordering(N->getConstantOperandVal(Num: 1)),
2122 S: Scopes[N->getConstantOperandVal(Num: 2)], T: Subtarget);
2123 SDValue Chain = N->getOperand(Num: 0);
2124 SDNode *FenceNode = CurDAG->getMachineNode(Opcode: FenceOp, dl: DL, VT: MVT::Other, Op1: Chain);
2125 ReplaceNode(F: N, T: FenceNode);
2126 return true;
2127}
2128
2129NVPTXScopes::NVPTXScopes(LLVMContext &C) {
2130 Scopes[C.getOrInsertSyncScopeID(SSN: "singlethread")] = NVPTX::Scope::Thread;
2131 Scopes[C.getOrInsertSyncScopeID(SSN: "")] = NVPTX::Scope::System;
2132 Scopes[C.getOrInsertSyncScopeID(SSN: "block")] = NVPTX::Scope::Block;
2133 Scopes[C.getOrInsertSyncScopeID(SSN: "cluster")] = NVPTX::Scope::Cluster;
2134 Scopes[C.getOrInsertSyncScopeID(SSN: "device")] = NVPTX::Scope::Device;
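  // For example, an IR "fence syncscope("cluster") acquire" resolves its scope
  // operand to NVPTX::Scope::Cluster through this table.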
2135}
2136
2137NVPTX::Scope NVPTXScopes::operator[](SyncScope::ID ID) const {
2138 if (Scopes.empty())
2139 llvm_unreachable("NVPTX Scopes must be initialized before calling "
2140 "NVPTXScopes::operator[]");
2141
2142 auto S = Scopes.find(Key: ID);
2143 if (S == Scopes.end()) {
2144 // TODO:
2145 // - Add API to LLVMContext to get the name of a single scope.
2146 // - Use that API here to print an error containing the name
2147 // of this Unknown ID.
2148 report_fatal_error(reason: formatv(Fmt: "Could not find scope ID={}.", Vals: int(ID)));
2149 }
2150 return S->second;
2151}
2152
2153bool NVPTXScopes::empty() const { return Scopes.empty(); }
2154
2155#define CP_ASYNC_BULK_TENSOR_OPCODE(dir, dim, mode, is_s32, suffix) \
2156 (is_s32 \
2157 ? NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_SHARED32_##mode##suffix \
2158 : NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_##mode##suffix)
2159
2160#define CP_ASYNC_BULK_TENSOR_OPCODE_S2G_IMPL(op, dim, mode, is_ch, is_s32) \
2161 (is_ch ? (CP_ASYNC_BULK_TENSOR_OPCODE(op, dim, mode, is_s32, _CH)) \
2162 : (CP_ASYNC_BULK_TENSOR_OPCODE(op, dim, mode, is_s32, )))
2163
2164#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(dim, mode, is_reduce, is_ch, \
2165 is_s32) \
2166 (is_reduce \
2167 ? (CP_ASYNC_BULK_TENSOR_OPCODE_S2G_IMPL(RED, dim, mode, is_ch, is_s32)) \
2168 : (CP_ASYNC_BULK_TENSOR_OPCODE_S2G_IMPL(S2G, dim, mode, is_ch, \
2169 is_s32)))
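// Illustrative expansion: GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(3D, IM2COL,
// /*is_reduce=*/false, /*is_ch=*/true, /*is_s32=*/false) resolves to
// NVPTX::CP_ASYNC_BULK_TENSOR_S2G_3D_IM2COL_CH.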
2170
2171#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(dim, mode, is_mc, is_ch, is_s32) \
2172 [&]() -> auto { \
2173 if (is_mc && is_ch) \
2174 return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _MC_CH); \
2175 if (is_ch) \
2176 return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _CH); \
2177 if (is_mc) \
2178 return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _MC); \
2179 return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, ); \
2180 }()
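// Illustrative expansion: GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(2D, TILE,
// /*is_mc=*/true, /*is_ch=*/false, /*is_s32=*/true) resolves to
// NVPTX::CP_ASYNC_BULK_TENSOR_G2S_2D_SHARED32_TILE_MC.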
2181
2182#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(dim, mode, is_ch) \
2183 (is_ch ? NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode##_CH \
2184 : NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode)
2185
2186static unsigned GetCpAsyncBulkTensorS2GOpcode(size_t Dim, bool IsShared32,
2187 bool IsCacheHint, bool IsIm2Col,
2188 bool IsReduce = false) {
2189 if (IsIm2Col) {
2190 switch (Dim) {
2191 case 3:
2192 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(3D, IM2COL, IsReduce,
2193 IsCacheHint, IsShared32);
2194 case 4:
2195 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(4D, IM2COL, IsReduce,
2196 IsCacheHint, IsShared32);
2197 case 5:
2198 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(5D, IM2COL, IsReduce,
2199 IsCacheHint, IsShared32);
2200 default:
2201 llvm_unreachable("Invalid Dimension in im2col mode for "
2202 "GetCpAsyncBulkTensorS2GOpcode.");
2203 }
2204 } else {
2205 switch (Dim) {
2206 case 1:
2207 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(1D, TILE, IsReduce,
2208 IsCacheHint, IsShared32);
2209 case 2:
2210 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(2D, TILE, IsReduce,
2211 IsCacheHint, IsShared32);
2212 case 3:
2213 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(3D, TILE, IsReduce,
2214 IsCacheHint, IsShared32);
2215 case 4:
2216 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(4D, TILE, IsReduce,
2217 IsCacheHint, IsShared32);
2218 case 5:
2219 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(5D, TILE, IsReduce,
2220 IsCacheHint, IsShared32);
2221 default:
2222 llvm_unreachable(
2223 "Invalid Dimension in tile mode for GetCpAsyncBulkTensorS2GOpcode.");
2224 }
2225 }
2226}
2227
2228static unsigned GetCpAsyncBulkTensorG2SOpcode(size_t Dim, bool IsShared32,
2229 bool IsMultiCast,
2230 bool IsCacheHint, bool IsIm2Col) {
2231 if (IsIm2Col) {
2232 switch (Dim) {
2233 case 3:
2234 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, IM2COL, IsMultiCast,
2235 IsCacheHint, IsShared32);
2236 case 4:
2237 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(4D, IM2COL, IsMultiCast,
2238 IsCacheHint, IsShared32);
2239 case 5:
2240 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(5D, IM2COL, IsMultiCast,
2241 IsCacheHint, IsShared32);
2242 default:
2243 llvm_unreachable("Invalid Dimension in im2col mode for "
2244 "GetCpAsyncBulkTensorG2SOpcode.");
2245 }
2246 } else {
2247 switch (Dim) {
2248 case 1:
2249 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(1D, TILE, IsMultiCast,
2250 IsCacheHint, IsShared32);
2251 case 2:
2252 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(2D, TILE, IsMultiCast,
2253 IsCacheHint, IsShared32);
2254 case 3:
2255 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, TILE, IsMultiCast,
2256 IsCacheHint, IsShared32);
2257 case 4:
2258 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(4D, TILE, IsMultiCast,
2259 IsCacheHint, IsShared32);
2260 case 5:
2261 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(5D, TILE, IsMultiCast,
2262 IsCacheHint, IsShared32);
2263 default:
2264 llvm_unreachable(
2265 "Invalid Dimension in tile mode for GetCpAsyncBulkTensorG2SOpcode.");
2266 }
2267 }
2268}
2269
2270static unsigned GetCpAsyncBulkTensorPrefetchOpcode(size_t Dim, bool IsCacheHint,
2271 bool IsIm2Col) {
2272 if (IsIm2Col) {
2273 switch (Dim) {
2274 case 3:
2275 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(3D, IM2COL, IsCacheHint);
2276 case 4:
2277 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(4D, IM2COL, IsCacheHint);
2278 case 5:
2279 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(5D, IM2COL, IsCacheHint);
2280 default:
2281 llvm_unreachable("Invalid Dimension in im2col mode for "
2282 "GetCpAsyncBulkTensorPrefetchOpcode.");
2283 }
2284 } else {
2285 switch (Dim) {
2286 case 1:
2287 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(1D, TILE, IsCacheHint);
2288 case 2:
2289 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(2D, TILE, IsCacheHint);
2290 case 3:
2291 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(3D, TILE, IsCacheHint);
2292 case 4:
2293 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(4D, TILE, IsCacheHint);
2294 case 5:
2295 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(5D, TILE, IsCacheHint);
2296 default:
2297 llvm_unreachable("Invalid Dimension in tile mode for "
2298 "GetCpAsyncBulkTensorPrefetchOpcode.");
2299 }
2300 }
2301}
2302
2303static size_t GetDimsFromIntrinsic(unsigned IID) {
2304 switch (IID) {
2305 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
2306 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_3d:
2307 return 3;
2308 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
2309 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_4d:
2310 return 4;
2311 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
2312 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_5d:
2313 return 5;
2314 default:
2315 llvm_unreachable("Invalid im2col intrinsic in GetDimsFromIntrinsic.");
2316 }
2317}
2318
2319void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon(SDNode *N,
2320 bool IsIm2Col) {
2321  // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
2322  // {dst, mbar, src, dims{d0...dN}, im2col_offsets{dims-2},
2323 // multicast, cache_hint,
2324 // multicast_flag, cache_hint_flag, cta_group_flag}
2325 // NumOperands = {Chain, IID} + {Actual intrinsic args}
2326 // = {2} + {8 + dims + im2col_offsets}
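  // For example, the 2D tile variant has NumOps = 2 + (3 + 2 dims + 5 flag/hint
  // operands) = 12, so NumDims = 12 - 10 = 2.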
2327 size_t NumOps = N->getNumOperands();
2328 size_t NumDims = IsIm2Col ? GetDimsFromIntrinsic(IID: N->getConstantOperandVal(Num: 1))
2329 : (NumOps - 10);
2330  // The offset count is always 'NumDims - 2'; offsets exist only in im2col mode
2331 size_t NumOffsets = IsIm2Col ? (NumDims - 2) : 0;
2332 bool IsCacheHint = N->getConstantOperandVal(Num: NumOps - 2) == 1;
2333 bool IsMultiCast = N->getConstantOperandVal(Num: NumOps - 3) == 1;
2334 size_t NumBaseArgs = NumDims + NumOffsets + 3; // for {dst, mbar, src}
2335 size_t MultiCastIdx = NumBaseArgs + 2; // for Chain and IID
2336
2337 unsigned CTAGroupVal = N->getConstantOperandVal(Num: NumOps - 1);
2338 if ((CTAGroupVal > 0) && !Subtarget->hasCpAsyncBulkTensorCTAGroupSupport())
2339 report_fatal_error(
2340 reason: formatv(Fmt: "CpAsyncBulkTensorG2S cta_group::1/2 is not supported on sm_{}",
2341 Vals: Subtarget->getSmVersion()));
2342
2343 SDLoc DL(N);
2344 SmallVector<SDValue, 8> Ops(N->ops().slice(N: 2, M: NumBaseArgs));
2345
2346 // Push MultiCast operand, if available
2347 if (IsMultiCast)
2348 Ops.push_back(Elt: N->getOperand(Num: MultiCastIdx));
2349
2350 // Push CacheHint operand, if available
2351 if (IsCacheHint)
2352 Ops.push_back(Elt: N->getOperand(Num: MultiCastIdx + 1));
2353
2354 // Flag for CTA Group
2355 Ops.push_back(Elt: getI32Imm(Imm: CTAGroupVal, DL));
2356
2357 // Finally, the chain operand
2358 Ops.push_back(Elt: N->getOperand(Num: 0));
2359
2360 bool IsShared32 =
2361 CurDAG->getDataLayout().getPointerSizeInBits(AS: ADDRESS_SPACE_SHARED) == 32;
2362 unsigned Opcode = GetCpAsyncBulkTensorG2SOpcode(
2363 Dim: NumDims, IsShared32, IsMultiCast, IsCacheHint, IsIm2Col);
2364 ReplaceNode(F: N, T: CurDAG->getMachineNode(Opcode, dl: DL, VTs: N->getVTList(), Ops));
2365}
2366
2367void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorS2GCommon(SDNode *N,
2368 bool IsIm2Col) {
2369  // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
2370 // src, dst, dims{d0...dN}, cache_hint, cache_hint_flag
2371 // NumOperands = {Chain, IID} + {Actual intrinsic args}
2372 // = {2} + {4 + dims}
2373 size_t NumOps = N->getNumOperands();
2374 size_t NumDims = NumOps - 6;
2375 bool IsCacheHint = N->getConstantOperandVal(Num: NumOps - 1) == 1;
2376 size_t NumArgs = NumDims + (IsCacheHint ? 3 : 2); // src, dst, cache_hint
2377
2378 SDLoc DL(N);
2379 SmallVector<SDValue, 8> Ops(N->ops().slice(N: 2, M: NumArgs));
2380 Ops.push_back(Elt: N->getOperand(Num: 0)); // Chain operand
2381
2382 bool IsShared32 =
2383 CurDAG->getDataLayout().getPointerSizeInBits(AS: ADDRESS_SPACE_SHARED) == 32;
2384 unsigned Opcode =
2385 GetCpAsyncBulkTensorS2GOpcode(Dim: NumDims, IsShared32, IsCacheHint, IsIm2Col);
2386 ReplaceNode(F: N, T: CurDAG->getMachineNode(Opcode, dl: DL, VTs: N->getVTList(), Ops));
2387}
2388
2389void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorPrefetchCommon(SDNode *N,
2390 bool IsIm2Col) {
2391  // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
2392  // {src, dims{d0...dN}, im2col_offsets{dims-2},
2393 // cache_hint, cache_hint_flag}
2394 // NumOperands = {Chain, IID} + {Actual intrinsic args}
2395 // = {2} + {3 + dims + im2col_offsets}
2396 size_t NumOps = N->getNumOperands();
2397 size_t NumDims = IsIm2Col ? GetDimsFromIntrinsic(IID: N->getConstantOperandVal(Num: 1))
2398 : (NumOps - 5);
2399  // The offset count is always 'NumDims - 2'; offsets exist only in im2col mode
2400 size_t NumOffsets = IsIm2Col ? (NumDims - 2) : 0;
2401 bool IsCacheHint = N->getConstantOperandVal(Num: NumOps - 1) == 1;
2402 size_t NumArgs = NumDims + NumOffsets + (IsCacheHint ? 2 : 1);
2403
2404 SDLoc DL(N);
2405 SmallVector<SDValue, 12> Ops(N->ops().slice(N: 2, M: NumArgs));
2406 Ops.push_back(Elt: N->getOperand(Num: 0)); // Chain operand
2407
2408 unsigned Opcode =
2409 GetCpAsyncBulkTensorPrefetchOpcode(Dim: NumDims, IsCacheHint, IsIm2Col);
2410 ReplaceNode(F: N, T: CurDAG->getMachineNode(Opcode, dl: DL, VTs: N->getVTList(), Ops));
2411}
2412
2413void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N,
2414 unsigned RedOp,
2415 bool IsIm2Col) {
2416  // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
2417 // src, dst, dims{d0...dN}, cache_hint, cache_hint_flag
2418 // NumOperands = {Chain, IID} + {Actual intrinsic args}
2419 // = {2} + {4 + dims}
2420 size_t NumOps = N->getNumOperands();
2421 size_t NumDims = NumOps - 6;
2422 bool IsCacheHint = N->getConstantOperandVal(Num: NumOps - 1) == 1;
2423 size_t NumArgs = NumDims + (IsCacheHint ? 3 : 2); // src, dst, cache_hint
2424
2425 SDLoc DL(N);
2426 SmallVector<SDValue, 12> Ops(N->ops().slice(N: 2, M: NumArgs));
2427 Ops.push_back(Elt: getI32Imm(Imm: RedOp, DL)); // Reduction Op
2428 Ops.push_back(Elt: N->getOperand(Num: 0)); // Chain operand
2429
2430 bool IsShared32 =
2431 CurDAG->getDataLayout().getPointerSizeInBits(AS: ADDRESS_SPACE_SHARED) == 32;
2432 unsigned Opcode = GetCpAsyncBulkTensorS2GOpcode(
2433 Dim: NumDims, IsShared32, IsCacheHint, IsIm2Col, /*IsReduce=*/true);
2434 ReplaceNode(F: N, T: CurDAG->getMachineNode(Opcode, dl: DL, VTs: N->getVTList(), Ops));
2435}
2436
2437#define TCGEN05_ST_OPCODE(SHAPE, NUM) \
2438 (enableUnpack ? NVPTX::TCGEN05_ST_##SHAPE##_##NUM##_UNPACK \
2439 : NVPTX::TCGEN05_ST_##SHAPE##_##NUM)
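// Illustrative expansion: getTcgen05StOpcode(Intrinsic::nvvm_tcgen05_st_16x64b_x2,
// /*enableUnpack=*/true) returns NVPTX::TCGEN05_ST_16x64b_x2_UNPACK.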
2440
2441static unsigned getTcgen05StOpcode(unsigned IID, bool enableUnpack) {
2442 switch (IID) {
2443 case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
2444 return TCGEN05_ST_OPCODE(16x64b, x1);
2445 case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
2446 return TCGEN05_ST_OPCODE(16x64b, x2);
2447 case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
2448 return TCGEN05_ST_OPCODE(16x64b, x4);
2449 case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
2450 return TCGEN05_ST_OPCODE(16x64b, x8);
2451 case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
2452 return TCGEN05_ST_OPCODE(16x64b, x16);
2453 case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
2454 return TCGEN05_ST_OPCODE(16x64b, x32);
2455 case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
2456 return TCGEN05_ST_OPCODE(16x64b, x64);
2457 case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
2458 return TCGEN05_ST_OPCODE(16x64b, x128);
2459 case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
2460 return TCGEN05_ST_OPCODE(16x128b, x1);
2461 case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
2462 return TCGEN05_ST_OPCODE(16x128b, x2);
2463 case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
2464 return TCGEN05_ST_OPCODE(16x128b, x4);
2465 case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
2466 return TCGEN05_ST_OPCODE(16x128b, x8);
2467 case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
2468 return TCGEN05_ST_OPCODE(16x128b, x16);
2469 case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
2470 return TCGEN05_ST_OPCODE(16x128b, x32);
2471 case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
2472 return TCGEN05_ST_OPCODE(16x128b, x64);
2473 case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
2474 return TCGEN05_ST_OPCODE(16x256b, x1);
2475 case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
2476 return TCGEN05_ST_OPCODE(16x256b, x2);
2477 case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
2478 return TCGEN05_ST_OPCODE(16x256b, x4);
2479 case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
2480 return TCGEN05_ST_OPCODE(16x256b, x8);
2481 case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
2482 return TCGEN05_ST_OPCODE(16x256b, x16);
2483 case Intrinsic::nvvm_tcgen05_st_16x256b_x32:
2484 return TCGEN05_ST_OPCODE(16x256b, x32);
2485 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1:
2486 return TCGEN05_ST_OPCODE(16x32bx2, x1);
2487 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2:
2488 return TCGEN05_ST_OPCODE(16x32bx2, x2);
2489 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4:
2490 return TCGEN05_ST_OPCODE(16x32bx2, x4);
2491 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8:
2492 return TCGEN05_ST_OPCODE(16x32bx2, x8);
2493 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16:
2494 return TCGEN05_ST_OPCODE(16x32bx2, x16);
2495 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32:
2496 return TCGEN05_ST_OPCODE(16x32bx2, x32);
2497 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64:
2498 return TCGEN05_ST_OPCODE(16x32bx2, x64);
2499 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128:
2500 return TCGEN05_ST_OPCODE(16x32bx2, x128);
2501 case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
2502 return TCGEN05_ST_OPCODE(32x32b, x1);
2503 case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
2504 return TCGEN05_ST_OPCODE(32x32b, x2);
2505 case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
2506 return TCGEN05_ST_OPCODE(32x32b, x4);
2507 case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
2508 return TCGEN05_ST_OPCODE(32x32b, x8);
2509 case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
2510 return TCGEN05_ST_OPCODE(32x32b, x16);
2511 case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
2512 return TCGEN05_ST_OPCODE(32x32b, x32);
2513 case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
2514 return TCGEN05_ST_OPCODE(32x32b, x64);
2515 case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
2516 return TCGEN05_ST_OPCODE(32x32b, x128);
2517 }
2518 llvm_unreachable("unhandled tcgen05.st lowering");
2519}
2520
2521void NVPTXDAGToDAGISel::SelectTcgen05St(SDNode *N, bool hasOffset) {
2522 SDLoc DL(N);
2523 unsigned IID = cast<ConstantSDNode>(Val: N->getOperand(Num: 1))->getZExtValue();
2524
2525 SmallVector<SDValue, 128> Operands = {
2526 N->getOperand(Num: 2) // taddr
2527 };
2528
2529 if (hasOffset)
2530 Operands.push_back(Elt: CurDAG->getTargetConstant(
2531 Val: cast<ConstantSDNode>(Val: N->getOperand(Num: 3))->getZExtValue(), DL,
2532 VT: MVT::i32)); // Offset
2533
2534 for (unsigned I = hasOffset ? 4 : 3; I < (N->getNumOperands() - 1); I++)
2535 Operands.push_back(Elt: N->getOperand(Num: I));
2536
2537 bool enableUnpack =
2538 cast<ConstantSDNode>(Val: N->getOperand(Num: N->getNumOperands() - 1))
2539 ->getZExtValue();
2540
2541 Operands.push_back(Elt: N->getOperand(Num: 0)); // Chain
2542 ReplaceNode(F: N, T: CurDAG->getMachineNode(Opcode: getTcgen05StOpcode(IID, enableUnpack),
2543 dl: DL, VTs: N->getVTList(), Ops: Operands));
2544}
2545
2546bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
2547 unsigned IID = N->getConstantOperandVal(Num: 1);
2548 using TMARedTy = llvm::nvvm::TMAReductionOp;
2549 auto CastTy = [](TMARedTy Op) { return static_cast<unsigned>(Op); };
2550 switch (IID) {
2551 default:
2552 return false;
2553 case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_1d:
2554 case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_2d:
2555 case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_3d:
2556 case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_4d:
2557 case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_5d:
2558 SelectCpAsyncBulkTensorS2GCommon(N);
2559 return true;
2560 case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_im2col_3d:
2561 case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_im2col_4d:
2562 case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_im2col_5d:
2563 SelectCpAsyncBulkTensorS2GCommon(N, /*IsIm2Col=*/true);
2564 return true;
2565 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d:
2566 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d:
2567 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d:
2568 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d:
2569 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d:
2570 SelectCpAsyncBulkTensorG2SCommon(N);
2571 return true;
2572 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
2573 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
2574 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
2575 SelectCpAsyncBulkTensorG2SCommon(N, /*IsIm2Col=*/true);
2576 return true;
2577 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_1d:
2578 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_2d:
2579 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_3d:
2580 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_4d:
2581 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_5d:
2582 SelectCpAsyncBulkTensorPrefetchCommon(N);
2583 return true;
2584 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_3d:
2585 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_4d:
2586 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_5d:
2587 SelectCpAsyncBulkTensorPrefetchCommon(N, /*IsIm2Col=*/true);
2588 return true;
2589 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_1d:
2590 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_2d:
2591 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_3d:
2592 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_4d:
2593 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_5d:
2594 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::ADD));
2595 return true;
2596 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_3d:
2597 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_4d:
2598 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_5d:
2599 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::ADD),
2600 /*IsIm2Col=*/true);
2601 return true;
2602 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_1d:
2603 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_2d:
2604 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_3d:
2605 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_4d:
2606 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_5d:
2607 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::MIN));
2608 return true;
2609 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_3d:
2610 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_4d:
2611 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_5d:
2612 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::MIN),
2613 /*IsIm2Col=*/true);
2614 return true;
2615 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_1d:
2616 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_2d:
2617 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_3d:
2618 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_4d:
2619 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_5d:
2620 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::MAX));
2621 return true;
2622 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_3d:
2623 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_4d:
2624 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_5d:
2625 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::MAX),
2626 /*IsIm2Col=*/true);
2627 return true;
2628 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_1d:
2629 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_2d:
2630 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_3d:
2631 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_4d:
2632 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_5d:
2633 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::INC));
2634 return true;
2635 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_3d:
2636 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_4d:
2637 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_5d:
2638 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::INC),
2639 /*IsIm2Col=*/true);
2640 return true;
2641 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_1d:
2642 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_2d:
2643 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_3d:
2644 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_4d:
2645 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_5d:
2646 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::DEC));
2647 return true;
2648 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_3d:
2649 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_4d:
2650 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_5d:
2651 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::DEC),
2652 /*IsIm2Col=*/true);
2653 return true;
2654 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_1d:
2655 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_2d:
2656 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_3d:
2657 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_4d:
2658 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_5d:
2659 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::AND));
2660 return true;
2661 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_3d:
2662 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_4d:
2663 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_5d:
2664 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::AND),
2665 /*IsIm2Col=*/true);
2666 return true;
2667 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_1d:
2668 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_2d:
2669 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_3d:
2670 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_4d:
2671 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_5d:
2672 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::OR));
2673 return true;
2674 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_3d:
2675 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_4d:
2676 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_5d:
2677 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::OR),
2678 /*IsIm2Col=*/true);
2679 return true;
2680 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_1d:
2681 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_2d:
2682 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_3d:
2683 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_4d:
2684 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_5d:
2685 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::XOR));
2686 return true;
2687 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_3d:
2688 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_4d:
2689 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_5d:
2690 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::XOR),
2691 /*IsIm2Col=*/true);
2692 return true;
2693
2694 case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
2695 case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
2696 case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
2697 case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
2698 case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
2699 case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
2700 case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
2701 case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
2702 case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
2703 case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
2704 case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
2705 case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
2706 case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
2707 case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
2708 case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
2709 case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
2710 case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
2711 case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
2712 case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
2713 case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
2714 case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
2715 case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
2716 case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
2717 case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
2718 case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
2719 case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
2720 case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
2721 case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
2722 case Intrinsic::nvvm_tcgen05_st_16x256b_x32: {
2723 SelectTcgen05St(N);
2724 return true;
2725 }
2726
2727 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1:
2728 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2:
2729 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4:
2730 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8:
2731 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16:
2732 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32:
2733 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64:
2734 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128: {
2735 SelectTcgen05St(N, /* hasOffset */ true);
2736 return true;
2737 }
2738 }
2739}
2740