1//===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines an instruction selector for the NVPTX target.
10//
11//===----------------------------------------------------------------------===//
12
13#include "NVPTXISelDAGToDAG.h"
14#include "NVPTX.h"
15#include "NVPTXUtilities.h"
16#include "llvm/ADT/APInt.h"
17#include "llvm/Analysis/ValueTracking.h"
18#include "llvm/CodeGen/ISDOpcodes.h"
19#include "llvm/CodeGen/MachineJumpTableInfo.h"
20#include "llvm/CodeGen/SelectionDAG.h"
21#include "llvm/CodeGen/SelectionDAGNodes.h"
22#include "llvm/IR/GlobalValue.h"
23#include "llvm/IR/Instructions.h"
24#include "llvm/IR/IntrinsicsNVPTX.h"
25#include "llvm/IR/NVVMIntrinsicUtils.h"
26#include "llvm/Support/AtomicOrdering.h"
27#include "llvm/Support/CommandLine.h"
28#include "llvm/Support/ErrorHandling.h"
29#include "llvm/Support/FormatVariadic.h"
30#include "llvm/Support/MathExtras.h"
31#include <optional>
32
33using namespace llvm;
34
35#define DEBUG_TYPE "nvptx-isel"
36#define PASS_NAME "NVPTX DAG->DAG Pattern Instruction Selection"
37
// Command-line toggle for selecting rsqrt.approx when a reciprocal-sqrt
// pattern is matched. On by default; hidden from -help.
static cl::opt<bool>
    EnableRsqrtOpt("nvptx-rsqrt-approx-opt", cl::init(Val: true), cl::Hidden,
                   cl::desc("Enable reciprocal sqrt optimization"));

// FIXME: This is a WAR to recover lost performance from #155024.
// We still need to investigate the regression and find a more permanent
// solution.
static cl::opt<bool> EnableMADWide("nvptx-mad-wide-opt", cl::init(Val: false),
                                   cl::Hidden,
                                   cl::desc("Enable MAD wide optimization"));
48
49/// createNVPTXISelDag - This pass converts a legalized DAG into a
50/// NVPTX-specific DAG, ready for instruction scheduling.
51FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
52 llvm::CodeGenOptLevel OptLevel) {
53 return new NVPTXDAGToDAGISelLegacy(TM, OptLevel);
54}
55
// Legacy-PM wrapper: forwards to the common SelectionDAGISelLegacy driver,
// handing it a fresh NVPTXDAGToDAGISel instance to do the actual work.
NVPTXDAGToDAGISelLegacy::NVPTXDAGToDAGISelLegacy(NVPTXTargetMachine &tm,
                                                 CodeGenOptLevel OptLevel)
    : SelectionDAGISelLegacy(
          ID, std::make_unique<NVPTXDAGToDAGISel>(args&: tm, args&: OptLevel)) {}
60
// Pass identification: the address of ID serves as the unique pass ID for
// the legacy pass manager.
char NVPTXDAGToDAGISelLegacy::ID = 0;

INITIALIZE_PASS(NVPTXDAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false)
64
// Keep a reference to the target machine; Subtarget/Scopes are filled in
// per-function by runOnMachineFunction.
NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
                                     CodeGenOptLevel OptLevel)
    : SelectionDAGISel(tm, OptLevel), TM(tm) {}
68
// Cache per-function state (the subtarget and the sync-scope-ID mapping)
// before delegating to the common SelectionDAG instruction-selection driver.
bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
  Subtarget = &MF.getSubtarget<NVPTXSubtarget>();
  Scopes = NVPTXScopes(MF.getFunction().getContext());
  return SelectionDAGISel::runOnMachineFunction(mf&: MF);
}
74
75NVPTX::DivPrecisionLevel
76NVPTXDAGToDAGISel::getDivF32Level(const SDNode *N) const {
77 return Subtarget->getTargetLowering()->getDivF32Level(MF: *MF, N: *N);
78}
79
80bool NVPTXDAGToDAGISel::usePrecSqrtF32(const SDNode *N) const {
81 return Subtarget->getTargetLowering()->usePrecSqrtF32(N);
82}
83
84bool NVPTXDAGToDAGISel::useF32FTZ() const {
85 return Subtarget->getTargetLowering()->useF32FTZ(MF: *MF);
86}
87
88bool NVPTXDAGToDAGISel::allowFMA() const {
89 const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
90 return TL->allowFMA(MF&: *MF, OptLevel);
91}
92
93bool NVPTXDAGToDAGISel::doRsqrtOpt() const { return EnableRsqrtOpt; }
94
95bool NVPTXDAGToDAGISel::doMADWideOpt() const { return EnableMADWide; }
96
/// Select - Select instructions not customized! Used for
/// expanded, promoted and normal instructions.
void NVPTXDAGToDAGISel::Select(SDNode *N) {

  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return; // Already selected.
  }

  // Dispatch on the node's opcode to a dedicated selector. A case that
  // breaks (or whose try* helper returns false) falls through to the
  // generic TableGen-generated matcher, SelectCode(), at the bottom.
  switch (N->getOpcode()) {
  case ISD::LOAD:
  case ISD::ATOMIC_LOAD:
  case NVPTXISD::MLoad:
    if (tryLoad(N))
      return;
    break;
  case ISD::STORE:
  case ISD::ATOMIC_STORE:
    if (tryStore(N))
      return;
    break;
  case ISD::ATOMIC_FENCE:
    if (tryFence(N))
      return;
    break;
  case NVPTXISD::UNPACK_VECTOR:
    tryUNPACK_VECTOR(N);
    return;
  case ISD::EXTRACT_VECTOR_ELT:
    if (tryEXTRACT_VECTOR_ELEMENT(N))
      return;
    break;
  case NVPTXISD::SETP_F16X2:
    SelectSETP_F16X2(N);
    return;
  case NVPTXISD::SETP_BF16X2:
    SelectSETP_BF16X2(N);
    return;
  case NVPTXISD::LoadV2:
  case NVPTXISD::LoadV4:
  case NVPTXISD::LoadV8:
    if (tryLoadVector(N))
      return;
    break;
  case NVPTXISD::LDUV2:
  case NVPTXISD::LDUV4:
    if (tryLDU(N))
      return;
    break;
  case NVPTXISD::StoreV2:
  case NVPTXISD::StoreV4:
  case NVPTXISD::StoreV8:
    if (tryStoreVector(N))
      return;
    break;
  case ISD::INTRINSIC_W_CHAIN:
    if (tryIntrinsicChain(N))
      return;
    break;
  case ISD::INTRINSIC_VOID:
    if (tryIntrinsicVoid(N))
      return;
    break;
  case ISD::AND:
  case ISD::SRA:
  case ISD::SRL:
    // Try to select BFE
    if (tryBFE(N))
      return;
    break;
  case ISD::ADDRSPACECAST:
    SelectAddrSpaceCast(N);
    return;
  case ISD::CopyToReg: {
    // i128 copies are handled specially (split across a register pair).
    if (N->getOperand(Num: 1).getValueType() == MVT::i128) {
      SelectV2I64toI128(N);
      return;
    }
    break;
  }
  case ISD::CopyFromReg: {
    if (N->getOperand(Num: 1).getValueType() == MVT::i128) {
      SelectI128toV2I64(N);
      return;
    }
    break;
  }
  case NVPTXISD::ATOMIC_CMP_SWAP_B128:
  case NVPTXISD::ATOMIC_SWAP_B128:
    selectAtomicSwap128(N);
    return;
  case ISD::FADD:
  case ISD::FMUL:
  case ISD::FSUB:
    if (tryBF16ArithToFMA(N))
      return;
    break;
  case ISD::BR_JT:
    return selectBR_JT(N);
  default:
    break;
  }
  // No custom selector claimed the node; use the auto-generated matcher.
  SelectCode(N);
}
201
// Expands to the packed or unpacked TCGEN05_LD machine opcode for the given
// shape (e.g. 16x64b) and vector multiplicity (e.g. x4).
#define TCGEN05_LD_OPCODE(SHAPE, NUM)                                          \
  (enablePack ? NVPTX::TCGEN05_LD_##SHAPE##_##NUM##_PACK                       \
              : NVPTX::TCGEN05_LD_##SHAPE##_##NUM)

// Map a tcgen05.ld intrinsic ID to its machine opcode, choosing the _PACK
// variant when enablePack is set.
static unsigned getTcgen05LdOpcode(unsigned IID, bool enablePack) {
  switch (IID) {
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
    return TCGEN05_LD_OPCODE(16x64b, x1);
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
    return TCGEN05_LD_OPCODE(16x64b, x2);
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
    return TCGEN05_LD_OPCODE(16x64b, x4);
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
    return TCGEN05_LD_OPCODE(16x64b, x8);
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
    return TCGEN05_LD_OPCODE(16x64b, x16);
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
    return TCGEN05_LD_OPCODE(16x64b, x32);
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
    return TCGEN05_LD_OPCODE(16x64b, x64);
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
    return TCGEN05_LD_OPCODE(16x64b, x128);
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
    return TCGEN05_LD_OPCODE(16x128b, x1);
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
    return TCGEN05_LD_OPCODE(16x128b, x2);
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
    return TCGEN05_LD_OPCODE(16x128b, x4);
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
    return TCGEN05_LD_OPCODE(16x128b, x8);
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
    return TCGEN05_LD_OPCODE(16x128b, x16);
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
    return TCGEN05_LD_OPCODE(16x128b, x32);
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
    return TCGEN05_LD_OPCODE(16x128b, x64);
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
    return TCGEN05_LD_OPCODE(16x256b, x1);
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
    return TCGEN05_LD_OPCODE(16x256b, x2);
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
    return TCGEN05_LD_OPCODE(16x256b, x4);
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
    return TCGEN05_LD_OPCODE(16x256b, x8);
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
    return TCGEN05_LD_OPCODE(16x256b, x16);
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
    return TCGEN05_LD_OPCODE(16x256b, x32);
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1:
    return TCGEN05_LD_OPCODE(16x32bx2, x1);
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
    return TCGEN05_LD_OPCODE(16x32bx2, x2);
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
    return TCGEN05_LD_OPCODE(16x32bx2, x4);
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
    return TCGEN05_LD_OPCODE(16x32bx2, x8);
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
    return TCGEN05_LD_OPCODE(16x32bx2, x16);
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
    return TCGEN05_LD_OPCODE(16x32bx2, x32);
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
    return TCGEN05_LD_OPCODE(16x32bx2, x64);
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128:
    return TCGEN05_LD_OPCODE(16x32bx2, x128);
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x1:
    return TCGEN05_LD_OPCODE(32x32b, x1);
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
    return TCGEN05_LD_OPCODE(32x32b, x2);
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
    return TCGEN05_LD_OPCODE(32x32b, x4);
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
    return TCGEN05_LD_OPCODE(32x32b, x8);
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
    return TCGEN05_LD_OPCODE(32x32b, x16);
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
    return TCGEN05_LD_OPCODE(32x32b, x32);
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
    return TCGEN05_LD_OPCODE(32x32b, x64);
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x128:
    return TCGEN05_LD_OPCODE(32x32b, x128);
  }
  llvm_unreachable("unhandled tcgen05.ld lowering");
}
285
// Select a tcgen05.ld intrinsic node into its machine instruction.
// Operand layout of N (an INTRINSIC_W_CHAIN):
//   0: chain, 1: intrinsic ID, 2: address operand,
//   then when hasOffset (the 16x32bx2 shapes): 3: immediate offset, 4: pack
//   flag; otherwise: 3: pack flag.
void NVPTXDAGToDAGISel::SelectTcgen05Ld(SDNode *N, bool hasOffset) {
  if (!Subtarget->hasTcgen05InstSupport())
    report_fatal_error(
        reason: "tcgen05.ld is not supported on this architecture variant");

  SDLoc DL(N);
  unsigned IID = cast<ConstantSDNode>(Val: N->getOperand(Num: 1))->getZExtValue();

  if (hasOffset) {
    bool enablePack = cast<ConstantSDNode>(Val: N->getOperand(Num: 4))->getZExtValue();
    // The offset is re-materialized as a target constant immediate.
    auto OffsetNode = CurDAG->getTargetConstant(
        Val: cast<ConstantSDNode>(Val: N->getOperand(Num: 3))->getZExtValue(), DL, VT: MVT::i32);
    ReplaceNode(F: N, T: CurDAG->getMachineNode(
                        Opcode: getTcgen05LdOpcode(IID, enablePack), dl: DL, VTs: N->getVTList(),
                        Ops: {N->getOperand(Num: 2), OffsetNode, N->getOperand(Num: 0)}));
  } else {
    bool enablePack = cast<ConstantSDNode>(Val: N->getOperand(Num: 3))->getZExtValue();
    ReplaceNode(F: N, T: CurDAG->getMachineNode(
                        Opcode: getTcgen05LdOpcode(IID, enablePack), dl: DL, VTs: N->getVTList(),
                        Ops: {N->getOperand(Num: 2), N->getOperand(Num: 0)}));
  }
}
308
// Custom-select chained intrinsics (ldu.global and tcgen05.ld). Returns true
// if the node was selected here; false lets the generic matcher handle it.
bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
  unsigned IID = N->getConstantOperandVal(Num: 1);
  switch (IID) {
  default:
    return false;
  case Intrinsic::nvvm_ldu_global_f:
  case Intrinsic::nvvm_ldu_global_i:
  case Intrinsic::nvvm_ldu_global_p:
    return tryLDU(N);

  case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x1:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
  case Intrinsic::nvvm_tcgen05_ld_32x32b_x128: {
    SelectTcgen05Ld(N);
    return true;
  }

  // The 16x32bx2 shapes carry an extra immediate-offset operand.
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128: {
    SelectTcgen05Ld(N, /* hasOffset */ true);
    return true;
  }
  }
}
365
366// Map ISD:CONDCODE value to appropriate CmpMode expected by
367// NVPTXInstPrinter::printCmpMode()
368SDValue NVPTXDAGToDAGISel::getPTXCmpMode(const CondCodeSDNode &CondCode) {
369 using NVPTX::PTXCmpMode::CmpMode;
370 const unsigned PTXCmpMode = [](ISD::CondCode CC) {
371 switch (CC) {
372 default:
373 llvm_unreachable("Unexpected condition code.");
374 case ISD::SETOEQ:
375 case ISD::SETEQ:
376 return CmpMode::EQ;
377 case ISD::SETOGT:
378 case ISD::SETGT:
379 return CmpMode::GT;
380 case ISD::SETOGE:
381 case ISD::SETGE:
382 return CmpMode::GE;
383 case ISD::SETOLT:
384 case ISD::SETLT:
385 return CmpMode::LT;
386 case ISD::SETOLE:
387 case ISD::SETLE:
388 return CmpMode::LE;
389 case ISD::SETONE:
390 case ISD::SETNE:
391 return CmpMode::NE;
392 case ISD::SETO:
393 return CmpMode::NUM;
394 case ISD::SETUO:
395 return CmpMode::NotANumber;
396 case ISD::SETUEQ:
397 return CmpMode::EQU;
398 case ISD::SETUGT:
399 return CmpMode::GTU;
400 case ISD::SETUGE:
401 return CmpMode::GEU;
402 case ISD::SETULT:
403 return CmpMode::LTU;
404 case ISD::SETULE:
405 return CmpMode::LEU;
406 case ISD::SETUNE:
407 return CmpMode::NEU;
408 }
409 }(CondCode.get());
410 return CurDAG->getTargetConstant(Val: PTXCmpMode, DL: SDLoc(), VT: MVT::i32);
411}
412
413bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
414 SDValue PTXCmpMode = getPTXCmpMode(CondCode: *cast<CondCodeSDNode>(Val: N->getOperand(Num: 2)));
415 SDLoc DL(N);
416 SDNode *SetP = CurDAG->getMachineNode(
417 Opcode: NVPTX::SETP_f16x2rr, dl: DL, VT1: MVT::i1, VT2: MVT::i1,
418 Ops: {N->getOperand(Num: 0), N->getOperand(Num: 1), PTXCmpMode,
419 CurDAG->getTargetConstant(Val: useF32FTZ() ? 1 : 0, DL, VT: MVT::i1)});
420 ReplaceNode(F: N, T: SetP);
421 return true;
422}
423
424bool NVPTXDAGToDAGISel::SelectSETP_BF16X2(SDNode *N) {
425 SDValue PTXCmpMode = getPTXCmpMode(CondCode: *cast<CondCodeSDNode>(Val: N->getOperand(Num: 2)));
426 SDLoc DL(N);
427 SDNode *SetP = CurDAG->getMachineNode(
428 Opcode: NVPTX::SETP_bf16x2rr, dl: DL, VT1: MVT::i1, VT2: MVT::i1,
429 Ops: {N->getOperand(Num: 0), N->getOperand(Num: 1), PTXCmpMode,
430 CurDAG->getTargetConstant(Val: useF32FTZ() ? 1 : 0, DL, VT: MVT::i1)});
431 ReplaceNode(F: N, T: SetP);
432 return true;
433}
434
435bool NVPTXDAGToDAGISel::tryUNPACK_VECTOR(SDNode *N) {
436 SDValue Vector = N->getOperand(Num: 0);
437 MVT EltVT = N->getSimpleValueType(ResNo: 0);
438
439 MachineSDNode *N2 =
440 CurDAG->getMachineNode(Opcode: NVPTX::I64toV2I32, dl: SDLoc(N), VT1: EltVT, VT2: EltVT, Ops: Vector);
441
442 ReplaceNode(F: N, T: N2);
443 return true;
444}
445
446// Find all instances of extract_vector_elt that use this v2f16 vector
447// and coalesce them into a scattering move instruction.
448bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
449 SDValue Vector = N->getOperand(Num: 0);
450
451 MVT VT = Vector.getSimpleValueType();
452 if (!(NVPTX::isPackedVectorTy(VT) && VT.getVectorNumElements() == 2))
453 return false;
454
455 unsigned Opcode;
456 if (VT.is32BitVector())
457 Opcode = NVPTX::I32toV2I16;
458 else if (VT.is64BitVector())
459 Opcode = NVPTX::I64toV2I32;
460 else
461 llvm_unreachable("Unhandled packed type");
462
463 // Find and record all uses of this vector that extract element 0 or 1.
464 SmallVector<SDNode *, 4> E0, E1;
465 for (auto *U : Vector.getNode()->users()) {
466 if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
467 continue;
468 if (U->getOperand(Num: 0) != Vector)
469 continue;
470 if (const ConstantSDNode *IdxConst =
471 dyn_cast<ConstantSDNode>(Val: U->getOperand(Num: 1))) {
472 if (IdxConst->getZExtValue() == 0)
473 E0.push_back(Elt: U);
474 else if (IdxConst->getZExtValue() == 1)
475 E1.push_back(Elt: U);
476 else
477 llvm_unreachable("Invalid vector index.");
478 }
479 }
480
481 // There's no point scattering f16x2 if we only ever access one
482 // element of it.
483 if (E0.empty() || E1.empty())
484 return false;
485
486 // Merge (EltTy extractelt(V, 0), EltTy extractelt(V,1))
487 // into EltTy,EltTy Split[EltTy]x2(V)
488 MVT EltVT = VT.getVectorElementType();
489 SDNode *ScatterOp =
490 CurDAG->getMachineNode(Opcode, dl: SDLoc(N), VT1: EltVT, VT2: EltVT, Ops: Vector);
491 for (auto *Node : E0)
492 ReplaceUses(F: SDValue(Node, 0), T: SDValue(ScatterOp, 0));
493 for (auto *Node : E1)
494 ReplaceUses(F: SDValue(Node, 0), T: SDValue(ScatterOp, 1));
495
496 return true;
497}
498
499NVPTX::AddressSpace NVPTXDAGToDAGISel::getAddrSpace(const MemSDNode *N) {
500 auto AS =
501 static_cast<NVPTX::AddressSpace>(N->getMemOperand()->getAddrSpace());
502 switch (AS) {
503 case NVPTX::AddressSpace::Generic:
504 case NVPTX::AddressSpace::Global:
505 case NVPTX::AddressSpace::Shared:
506 case NVPTX::AddressSpace::Const:
507 case NVPTX::AddressSpace::Local:
508 case NVPTX::AddressSpace::SharedCluster:
509 case NVPTX::AddressSpace::EntryParam:
510 case NVPTX::AddressSpace::DeviceParam:
511 return AS;
512 }
513 llvm_unreachable("Unexpected address space");
514}
515
516NVPTX::Ordering NVPTXDAGToDAGISel::getMemOrder(const MemSDNode *N) const {
517 // No "sem" orderings for SM/PTX versions which do not support memory ordering
518 if (!Subtarget->hasMemoryOrdering())
519 return NVPTX::Ordering::NotAtomic;
520 auto Ordering = N->getMergedOrdering();
521 switch (Ordering) {
522 case AtomicOrdering::NotAtomic:
523 return NVPTX::Ordering::NotAtomic;
524 case AtomicOrdering::Unordered:
525 case AtomicOrdering::Monotonic:
526 return NVPTX::Ordering::Relaxed;
527 case AtomicOrdering::Acquire:
528 return NVPTX::Ordering::Acquire;
529 case AtomicOrdering::Release:
530 return NVPTX::Ordering::Release;
531 case AtomicOrdering::AcquireRelease:
532 return NVPTX::Ordering::AcquireRelease;
533 case AtomicOrdering::SequentiallyConsistent:
534 return NVPTX::Ordering::SequentiallyConsistent;
535 }
536 llvm_unreachable("Invalid atomic ordering");
537}
538
539// Clusters contain exactly 1 block on targets without cluster support.
540static NVPTX::Scope resolveScope(NVPTX::Scope S, const NVPTXSubtarget *T) {
541 if (S == NVPTX::Scope::Cluster && !T->hasClusters())
542 return NVPTX::Scope::Block;
543 return S;
544}
545
546NVPTX::Scope NVPTXDAGToDAGISel::getAtomicScope(const MemSDNode *N) const {
547 if (!Subtarget->hasAtomScope())
548 return NVPTX::Scope::DefaultDevice;
549 return resolveScope(S: Scopes[N->getSyncScopeID()], T: Subtarget);
550}
551
552namespace {
553
// Pair of orderings produced when lowering a memory operation: the ordering
// to encode on the instruction itself, plus the ordering of a fence to emit
// before it (only non-NotAtomic for seq_cst lowering). The single-argument
// form doubles as an implicit conversion from NVPTX::Ordering.
struct OperationOrderings {
  NVPTX::Ordering InstructionOrdering, FenceOrdering;
  OperationOrderings(NVPTX::Ordering IO = NVPTX::Ordering::NotAtomic,
                     NVPTX::Ordering FO = NVPTX::Ordering::NotAtomic)
      : InstructionOrdering(IO), FenceOrdering(FO) {}
};
560
561static OperationOrderings
562getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) {
563 AtomicOrdering Ordering = N->getSuccessOrdering();
564 auto CodeAddrSpace = NVPTXDAGToDAGISel::getAddrSpace(N);
565
566 bool HasMemoryOrdering = Subtarget->hasMemoryOrdering();
567 bool HasRelaxedMMIO = Subtarget->hasRelaxedMMIO();
568
569 // clang-format off
570
571 // Lowering for Load/Store Operations (note: AcquireRelease Loads or Stores error).
572 // Note: uses of Relaxed in the Atomic column of this table refer
573 // to LLVM AtomicOrdering::Monotonic.
574 //
575 // | Atomic | Volatile | Statespace | PTX sm_60- | PTX sm_70+ |
576 // |---------|----------|--------------------|------------|------------------------------|
577 // | No | No | All | plain | .weak |
578 // | No | Yes | Generic,Shared, | .volatile | .volatile |
579 // | | | Global [0] | | |
580 // | No | Yes | Local,Const,Param | plain [1] | .weak [1] |
581 // | Unorder | Yes/No | All | == Relaxed | == Relaxed |
582 // | Relaxed | No | Generic,Shared, | .volatile | <atomic sem> |
583 // | | | Global [0] | | |
584 // | Other | No | Generic,Shared, | Error [2] | <atomic sem> |
585 // | | | Global [0] | | |
586 // | Yes | No | Local,Const,Param | plain [1] | .weak [1] |
587 // | Relaxed | Yes | Generic,Shared [0] | .volatile | .volatile |
588 // | Relaxed | Yes | Global [0] | .volatile | .mmio.relaxed.sys (PTX 8.2+) |
589 // | | | | | or .volatile (PTX 8.1-) |
590 // | Relaxed | Yes | Local,Const,Param | plain [1] | .weak [1] |
591 // | Other | Yes | Generic, Shared, | Error [2] | <atomic sem> [3] |
592 // | | | / Global [0] | | |
593
594 // Lowering of CUDA C++ SequentiallyConsistent Operations and Fences to PTX
595 // by following the ABI proven sound in:
596 // Lustig et al, A Formal Analysis of the NVIDIA PTX Memory Consistency Model, ASPLOS’19.
597 // https://dl.acm.org/doi/pdf/10.1145/3297858.3304043
598 //
599 // | CUDA C++ Atomic Operation or Atomic Fence | PTX Atomic Operation or Fence |
600 // |------------------------------------------------------|-------------------------------|
601 // | cuda::atomic_thread_fence | fence.sc.<scope>; |
602 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | |
603 // |------------------------------------------------------|-------------------------------|
604 // | cuda::atomic_load | fence.sc.<scope>; |
605 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | ld.acquire.<scope>; |
606 // |------------------------------------------------------|-------------------------------|
607 // | cuda::atomic_store | fence.sc.<scope>; |
608 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | st.release.<scope>; |
609 // |------------------------------------------------------|-------------------------------|
610 // | cuda::atomic_fetch_<op> | fence.sc.<scope>; |
611 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | atom.acq_rel.<scope>; |
612
613 // clang-format on
614
615 // [0]: volatile and atomics are only supported on global or shared
616 // memory locations, accessed via generic/shared/global pointers.
617 // MMIO is only supported on global memory locations,
618 // accessed via generic/global pointers.
619 // TODO: Implement MMIO access via generic pointer to global.
620 // Currently implemented for global pointers only.
621
622 // [1]: Lowering volatile/atomic operations to non-volatile/non-atomic
623 // PTX instructions fails to preserve their C++ side-effects.
624 //
625 // Example (https://github.com/llvm/llvm-project/issues/62057):
626 //
627 // void example() {
628 // std::atomic<bool> True = true;
629 // while (True.load(std::memory_order_relaxed));
630 // }
631 //
632 // A C++ program that calls "example" is well-defined: the infinite loop
633 // performs an atomic operation. By lowering volatile/atomics to
634 // "weak" memory operations, we are transforming the above into:
635 //
636 // void undefined_behavior() {
637 // bool True = true;
638 // while (True);
639 // }
640 //
641 // which exhibits undefined behavior in both C++ and PTX.
642 //
643 // Calling "example" in CUDA C++ compiled for sm_60- exhibits undefined
644 // behavior due to lack of Independent Forward Progress. Lowering these
645 // to weak memory operations in sm_60- is therefore fine.
646 //
647 // TODO: lower atomic and volatile operations to memory locations
648 // in local, const, and param to two PTX instructions in sm_70+:
649 // - the "weak" memory instruction we are currently lowering to, and
650 // - some other instruction that preserves the side-effect, e.g.,
651 // a dead dummy volatile load.
652 if (CodeAddrSpace == NVPTX::AddressSpace::Local ||
653 CodeAddrSpace == NVPTX::AddressSpace::Const ||
654 CodeAddrSpace == NVPTX::AddressSpace::EntryParam ||
655 CodeAddrSpace == NVPTX::AddressSpace::DeviceParam) {
656 return NVPTX::Ordering::NotAtomic;
657 }
658
659 // [2]: Atomics with Ordering different than Unordered or Relaxed are not
660 // supported on sm_60 and older; this includes volatile atomics.
661 if (!(Ordering == AtomicOrdering::NotAtomic ||
662 Ordering == AtomicOrdering::Unordered ||
663 Ordering == AtomicOrdering::Monotonic) &&
664 !HasMemoryOrdering) {
665 report_fatal_error(
666 reason: formatv(Fmt: "PTX does not support \"atomic\" for orderings different than"
667 "\"NotAtomic\" or \"Monotonic\" for sm_60 or older, but order "
668 "is: \"{}\".",
669 Vals: toIRString(ao: Ordering)));
670 }
671
672 // [3]: TODO: these should eventually use .mmio<.atomic sem>; for now we drop
673 // the volatile semantics and preserve the atomic ones.
674
675 // PTX volatile and PTX atomics are not available for statespace that differ
676 // from .generic, .global, or .shared. The behavior of PTX volatile and PTX
677 // atomics is undefined if the generic address does not refer to a .global or
678 // .shared memory location.
679 bool AddrGenericOrGlobalOrShared =
680 (CodeAddrSpace == NVPTX::AddressSpace::Generic ||
681 CodeAddrSpace == NVPTX::AddressSpace::Global ||
682 CodeAddrSpace == NVPTX::AddressSpace::Shared ||
683 CodeAddrSpace == NVPTX::AddressSpace::SharedCluster);
684 if (!AddrGenericOrGlobalOrShared)
685 return NVPTX::Ordering::NotAtomic;
686
687 bool UseRelaxedMMIO =
688 HasRelaxedMMIO && CodeAddrSpace == NVPTX::AddressSpace::Global;
689
690 switch (Ordering) {
691 case AtomicOrdering::NotAtomic:
692 return N->isVolatile() ? NVPTX::Ordering::Volatile
693 : NVPTX::Ordering::NotAtomic;
694 case AtomicOrdering::Unordered:
695 // We lower unordered in the exact same way as 'monotonic' to respect
696 // LLVM IR atomicity requirements.
697 case AtomicOrdering::Monotonic:
698 if (N->isVolatile())
699 return UseRelaxedMMIO ? NVPTX::Ordering::RelaxedMMIO
700 : NVPTX::Ordering::Volatile;
701 else
702 return HasMemoryOrdering ? NVPTX::Ordering::Relaxed
703 : NVPTX::Ordering::Volatile;
704 // case AtomicOrdering::Consume: // If LLVM ever provides this, lower it to
705 // Acquire.
706 case AtomicOrdering::Acquire:
707 if (!N->readMem())
708 report_fatal_error(
709 reason: formatv(Fmt: "PTX only supports Acquire Ordering on reads: {}",
710 Vals: N->getOperationName()));
711 return NVPTX::Ordering::Acquire;
712 case AtomicOrdering::Release:
713 if (!N->writeMem())
714 report_fatal_error(
715 reason: formatv(Fmt: "PTX only supports Release Ordering on writes: {}",
716 Vals: N->getOperationName()));
717 return NVPTX::Ordering::Release;
718 case AtomicOrdering::AcquireRelease: {
719 report_fatal_error(
720 reason: formatv(Fmt: "NVPTX does not support AcquireRelease Ordering on "
721 "read-modify-write "
722 "yet and PTX does not support it on loads or stores: {}",
723 Vals: N->getOperationName()));
724 }
725 case AtomicOrdering::SequentiallyConsistent: {
726 // LLVM-IR SequentiallyConsistent atomics map to a two-instruction PTX
727 // sequence including a "fence.sc.sco" and the memory instruction with an
728 // Ordering that differs from "sc": acq, rel, or acq_rel, depending on
729 // whether the memory operation is a read, write, or read-modify-write.
730 //
731 // This sets the ordering of the fence to SequentiallyConsistent, and
732 // sets the corresponding ordering for the instruction.
733 NVPTX::Ordering InstrOrder;
734 if (N->readMem())
735 InstrOrder = NVPTX::Ordering::Acquire;
736 else if (N->writeMem())
737 InstrOrder = NVPTX::Ordering::Release;
738 else
739 report_fatal_error(
740 reason: formatv(Fmt: "NVPTX does not support SequentiallyConsistent Ordering on "
741 "read-modify-writes yet: {}",
742 Vals: N->getOperationName()));
743 return OperationOrderings(InstrOrder,
744 NVPTX::Ordering::SequentiallyConsistent);
745 }
746 }
747 report_fatal_error(
748 reason: formatv(Fmt: "NVPTX backend does not support AtomicOrdering \"{}\" yet.",
749 Vals: toIRString(ao: Ordering)));
750}
751
752} // namespace
753
// Determine the synchronization scope to encode on a memory instruction,
// given the ordering already chosen for it.
NVPTX::Scope NVPTXDAGToDAGISel::getOperationScope(MemSDNode *N,
                                                  NVPTX::Ordering O) const {
  switch (O) {
  case NVPTX::Ordering::NotAtomic:
  case NVPTX::Ordering::Volatile: // Non-atomic volatile operations
    // NVPTX uses Thread scope as the scope of non-atomic operations.
    return NVPTX::Scope::Thread;
  case NVPTX::Ordering::RelaxedMMIO:
    // RelaxedMMIO operations are always system scope.
    // If a RelaxedMMIO order was generated from an atomic volatile operation
    // with a smaller thread scope, we bump it here to system scope.
    return NVPTX::Scope::System;
  case NVPTX::Ordering::Relaxed:
  case NVPTX::Ordering::Acquire:
  case NVPTX::Ordering::Release:
  case NVPTX::Ordering::AcquireRelease:
  case NVPTX::Ordering::SequentiallyConsistent:
    // Atomic orderings: take the scope from the IR sync-scope ID, demoting
    // Cluster scope on targets without cluster support.
    auto S = Scopes[N->getSyncScopeID()];

    S = resolveScope(S, T: Subtarget);

    // If operation is volatile, then its scope is system.
    return N->isVolatile() ? NVPTX::Scope::System : S;
  }
  llvm_unreachable("unhandled ordering");
}
780
781static bool canLowerToLDG(const MemSDNode &N, const NVPTXSubtarget &Subtarget,
782 NVPTX::AddressSpace CodeAddrSpace) {
783 // We use ldg (i.e. ld.global.nc) for invariant loads from the global address
784 // space.
785 return Subtarget.hasLDG() && CodeAddrSpace == NVPTX::AddressSpace::Global &&
786 N.isInvariant();
787}
788
// Pick the machine opcode for a fence with ordering O at scope S. Targets
// without full memory-ordering support (pre-sm_70) fall back to the legacy
// membar instructions; unsupported ordering/scope combinations abort.
static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S,
                               NVPTXSubtarget const *T) {
  S = resolveScope(S, T);

  // Fall back to .acq_rel if .acquire, .release is not supported.
  if (!T->hasSplitAcquireAndReleaseFences() &&
      (O == NVPTX::Ordering::Acquire || O == NVPTX::Ordering::Release))
    O = NVPTX::Ordering::AcquireRelease;

  switch (O) {
  case NVPTX::Ordering::Acquire:
    switch (S) {
    case NVPTX::Scope::System:
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_sys
                                    : NVPTX::INT_MEMBAR_SYS;
    case NVPTX::Scope::Block:
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_cta
                                    : NVPTX::INT_MEMBAR_CTA;
    case NVPTX::Scope::Cluster:
      return NVPTX::atomic_thread_fence_acquire_cluster;
    case NVPTX::Scope::Device:
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_gpu
                                    : NVPTX::INT_MEMBAR_GL;
    case NVPTX::Scope::Thread:
    case NVPTX::Scope::DefaultDevice:
      report_fatal_error(
          reason: formatv(Fmt: "Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
                  Vals: ScopeToString(S)));
    }
    break;
  case NVPTX::Ordering::Release:
    switch (S) {
    case NVPTX::Scope::System:
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_sys
                                    : NVPTX::INT_MEMBAR_SYS;
    case NVPTX::Scope::Block:
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_cta
                                    : NVPTX::INT_MEMBAR_CTA;
    case NVPTX::Scope::Cluster:
      return NVPTX::atomic_thread_fence_release_cluster;
    case NVPTX::Scope::Device:
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_gpu
                                    : NVPTX::INT_MEMBAR_GL;
    case NVPTX::Scope::Thread:
    case NVPTX::Scope::DefaultDevice:
      report_fatal_error(
          reason: formatv(Fmt: "Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
                  Vals: ScopeToString(S)));
    }
    break;
  case NVPTX::Ordering::AcquireRelease: {
    switch (S) {
    case NVPTX::Scope::System:
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_sys
                                    : NVPTX::INT_MEMBAR_SYS;
    case NVPTX::Scope::Block:
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_cta
                                    : NVPTX::INT_MEMBAR_CTA;
    case NVPTX::Scope::Cluster:
      return NVPTX::atomic_thread_fence_acq_rel_cluster;
    case NVPTX::Scope::Device:
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_gpu
                                    : NVPTX::INT_MEMBAR_GL;
    case NVPTX::Scope::Thread:
    case NVPTX::Scope::DefaultDevice:
      report_fatal_error(
          reason: formatv(Fmt: "Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
                  Vals: ScopeToString(S)));
    }
    break;
  }
  case NVPTX::Ordering::SequentiallyConsistent: {
    switch (S) {
    case NVPTX::Scope::System:
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_sys
                                    : NVPTX::INT_MEMBAR_SYS;
    case NVPTX::Scope::Block:
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_cta
                                    : NVPTX::INT_MEMBAR_CTA;
    case NVPTX::Scope::Cluster:
      return NVPTX::atomic_thread_fence_seq_cst_cluster;
    case NVPTX::Scope::Device:
      return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_gpu
                                    : NVPTX::INT_MEMBAR_GL;
    case NVPTX::Scope::Thread:
    case NVPTX::Scope::DefaultDevice:
      report_fatal_error(reason: formatv(Fmt: "Unsupported scope \"{}\" for seq_cst fence.",
                                 Vals: ScopeToString(S)));
    }
    break;
  }
  case NVPTX::Ordering::NotAtomic:
  case NVPTX::Ordering::Relaxed:
  case NVPTX::Ordering::Volatile:
  case NVPTX::Ordering::RelaxedMMIO:
    // These orderings never require a fence instruction.
    report_fatal_error(
        reason: formatv(Fmt: "Unsupported \"{}\" ordering and \"{}\" scope for fence.",
                Vals: OrderingToString(Order: O), Vals: ScopeToString(S)));
  }
  llvm_unreachable("unhandled ordering");
}
890
// Returns Memory Order and Scope of a memory instruction, and
// inserts any fence before the instruction that's required to
// implement its memory ordering.
std::pair<NVPTX::Ordering, NVPTX::Scope>
NVPTXDAGToDAGISel::insertMemoryInstructionFence(SDLoc DL, SDValue &Chain,
                                                MemSDNode *N) {
  // Split the instruction's requirements into the ordering applied to the
  // instruction itself and any ordering that must be realized as a separate
  // fence emitted before it.
  auto [InstructionOrdering, FenceOrdering] =
      getOperationOrderings(N, Subtarget);
  auto Scope = getOperationScope(N, O: InstructionOrdering);

  // Singlethread scope has no inter-thread synchronization requirements, so
  // the atomic operation is lowered as plain and the fence is skipped.
  // NotAtomic and Volatile operations naturally have Thread scope and must
  // preserve their ordering.
  if (Scope == NVPTX::Scope::Thread &&
      InstructionOrdering != NVPTX::Ordering::NotAtomic &&
      InstructionOrdering != NVPTX::Ordering::Volatile)
    return {NVPTX::Ordering::NotAtomic, Scope};

  // If a fence is required before the operation, insert it:
  switch (NVPTX::Ordering(FenceOrdering)) {
  case NVPTX::Ordering::NotAtomic:
    // No fence needed.
    break;
  case NVPTX::Ordering::SequentiallyConsistent: {
    // Emit the fence as a machine node and thread it onto the chain so the
    // memory instruction is ordered after it.
    auto Op = getFenceOp(O: FenceOrdering, S: Scope, T: Subtarget);
    Chain = SDValue(CurDAG->getMachineNode(Opcode: Op, dl: DL, VT: MVT::Other, Op1: Chain), 0);
    break;
  }
  default:
    // Only no-fence and seq_cst pre-fences are expected here.
    report_fatal_error(
        reason: formatv(Fmt: "Unexpected fence ordering: \"{}\".",
                OrderingToString(Order: NVPTX::Ordering(FenceOrdering))));
  }
  return {InstructionOrdering, Scope};
}
926
// Lower an addrspacecast node to the matching cvta (specific -> generic) or
// cvta.to (generic -> specific) instruction, inserting a pointer-width
// conversion when the two address spaces use different pointer sizes.
void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
  SDValue Src = N->getOperand(Num: 0);
  AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(Val: N);
  unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
  unsigned DstAddrSpace = CastN->getDestAddressSpace();
  SDLoc DL(N);
  assert(SrcAddrSpace != DstAddrSpace &&
         "addrspacecast must be between different address spaces");

  if (DstAddrSpace == ADDRESS_SPACE_GENERIC) {
    // Specific to generic

    // If the source address space uses 32-bit pointers in a 64-bit module,
    // widen the pointer to 64 bits before the cvta.
    if (TM.is64Bit() && TM.getPointerSizeInBits(AS: SrcAddrSpace) == 32) {
      SDValue CvtNone =
          CurDAG->getTargetConstant(Val: NVPTX::PTXCvtMode::NONE, DL, VT: MVT::i32);
      SDNode *Cvt = CurDAG->getMachineNode(Opcode: NVPTX::CVT_u64_u32, dl: DL, VT: MVT::i64,
                                           Op1: Src, Op2: CvtNone);
      Src = SDValue(Cvt, 0);
    }

    // Pick the cvta variant for the source address space and pointer width.
    unsigned Opc;
    switch (SrcAddrSpace) {
    default: report_fatal_error(reason: "Bad address space in addrspacecast");
    case ADDRESS_SPACE_GLOBAL:
      Opc = TM.is64Bit() ? NVPTX::cvta_global_64 : NVPTX::cvta_global;
      break;
    case ADDRESS_SPACE_SHARED:
      Opc = TM.is64Bit() ? NVPTX::cvta_shared_64 : NVPTX::cvta_shared;
      break;
    case ADDRESS_SPACE_SHARED_CLUSTER:
      // Shared-cluster pointers only exist in 64-bit mode.
      if (!TM.is64Bit())
        report_fatal_error(
            reason: "Shared cluster address space is only supported in 64-bit mode");
      Opc = NVPTX::cvta_shared_cluster_64;
      break;
    case ADDRESS_SPACE_CONST:
      Opc = TM.is64Bit() ? NVPTX::cvta_const_64 : NVPTX::cvta_const;
      break;
    case ADDRESS_SPACE_LOCAL:
      Opc = TM.is64Bit() ? NVPTX::cvta_local_64 : NVPTX::cvta_local;
      break;
    case ADDRESS_SPACE_ENTRY_PARAM:
      Opc = TM.is64Bit() ? NVPTX::cvta_param_64 : NVPTX::cvta_param;
      break;
    }
    ReplaceNode(F: N, T: CurDAG->getMachineNode(Opcode: Opc, dl: DL, VT: N->getValueType(ResNo: 0), Op1: Src));
    return;
  } else {
    // Generic to specific
    if (SrcAddrSpace != 0)
      report_fatal_error(reason: "Cannot cast between two non-generic address spaces");
    // Pick the cvta.to variant for the destination address space.
    unsigned Opc;
    switch (DstAddrSpace) {
    default: report_fatal_error(reason: "Bad address space in addrspacecast");
    case ADDRESS_SPACE_GLOBAL:
      Opc = TM.is64Bit() ? NVPTX::cvta_to_global_64 : NVPTX::cvta_to_global;
      break;
    case ADDRESS_SPACE_SHARED:
      Opc = TM.is64Bit() ? NVPTX::cvta_to_shared_64 : NVPTX::cvta_to_shared;
      break;
    case ADDRESS_SPACE_SHARED_CLUSTER:
      if (!TM.is64Bit())
        report_fatal_error(
            reason: "Shared cluster address space is only supported in 64-bit mode");
      Opc = NVPTX::cvta_to_shared_cluster_64;
      break;
    case ADDRESS_SPACE_CONST:
      Opc = TM.is64Bit() ? NVPTX::cvta_to_const_64 : NVPTX::cvta_to_const;
      break;
    case ADDRESS_SPACE_LOCAL:
      Opc = TM.is64Bit() ? NVPTX::cvta_to_local_64 : NVPTX::cvta_to_local;
      break;
    case ADDRESS_SPACE_ENTRY_PARAM:
      Opc = TM.is64Bit() ? NVPTX::cvta_to_param_64 : NVPTX::cvta_to_param;
      break;
    }

    SDNode *CVTA = CurDAG->getMachineNode(Opcode: Opc, dl: DL, VT: N->getValueType(ResNo: 0), Op1: Src);
    // If the destination space uses 32-bit pointers in a 64-bit module,
    // truncate the converted pointer back down to 32 bits.
    if (TM.is64Bit() && TM.getPointerSizeInBits(AS: DstAddrSpace) == 32) {
      SDValue CvtNone =
          CurDAG->getTargetConstant(Val: NVPTX::PTXCvtMode::NONE, DL, VT: MVT::i32);
      CVTA = CurDAG->getMachineNode(Opcode: NVPTX::CVT_u32_u64, dl: DL, VT: MVT::i32,
                                    Op1: SDValue(CVTA, 0), Op2: CvtNone);
    }

    ReplaceNode(F: N, T: CVTA);
    return;
  }
}
1016
1017// Helper function template to reduce amount of boilerplate code for
1018// opcode selection.
1019static std::optional<unsigned>
1020pickOpcodeForVT(MVT::SimpleValueType VT, std::optional<unsigned> Opcode_i16,
1021 std::optional<unsigned> Opcode_i32,
1022 std::optional<unsigned> Opcode_i64) {
1023 switch (VT) {
1024 case MVT::f16:
1025 case MVT::i16:
1026 case MVT::bf16:
1027 return Opcode_i16;
1028 case MVT::v2f16:
1029 case MVT::v2bf16:
1030 case MVT::v2i16:
1031 case MVT::v4i8:
1032 case MVT::i32:
1033 case MVT::f32:
1034 return Opcode_i32;
1035 case MVT::v2f32:
1036 case MVT::v2i32:
1037 case MVT::i64:
1038 case MVT::f64:
1039 return Opcode_i64;
1040 default:
1041 return std::nullopt;
1042 }
1043}
1044
1045static inline bool isAddLike(const SDValue V) {
1046 return V.getOpcode() == ISD::ADD ||
1047 (V->getOpcode() == ISD::OR && V->getFlags().hasDisjoint());
1048}
1049
1050static SDValue stripAssertAlign(SDValue N) {
1051 if (N.getOpcode() == ISD::AssertAlign)
1052 N = N.getOperand(i: 0);
1053 return N;
1054}
1055
1056// selectBaseADDR - Match a dag node which will serve as the base address for an
1057// ADDR operand pair.
1058static SDValue selectBaseADDR(SDValue N, SelectionDAG *DAG) {
1059 N = stripAssertAlign(N);
1060 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(Val&: N))
1061 return DAG->getTargetGlobalAddress(GV: GA->getGlobal(), DL: SDLoc(N),
1062 VT: GA->getValueType(ResNo: 0), offset: GA->getOffset(),
1063 TargetFlags: GA->getTargetFlags());
1064 if (const auto *ES = dyn_cast<ExternalSymbolSDNode>(Val&: N))
1065 return DAG->getTargetExternalSymbol(Sym: ES->getSymbol(), VT: ES->getValueType(ResNo: 0),
1066 TargetFlags: ES->getTargetFlags());
1067 if (const auto *FIN = dyn_cast<FrameIndexSDNode>(Val&: N))
1068 return DAG->getTargetFrameIndex(FI: FIN->getIndex(), VT: FIN->getValueType(ResNo: 0));
1069
1070 return N;
1071}
1072
1073static SDValue accumulateOffset(SDValue &Addr, SDLoc DL, SelectionDAG *DAG) {
1074 Addr = stripAssertAlign(N: Addr);
1075 APInt AccumulatedOffset(64u, 0);
1076 while (isAddLike(V: Addr)) {
1077 const auto *CN = dyn_cast<ConstantSDNode>(Val: Addr.getOperand(i: 1));
1078 if (!CN)
1079 break;
1080
1081 const APInt CI = CN->getAPIntValue().sext(width: 64);
1082 if (!(CI + AccumulatedOffset).isSignedIntN(N: 32))
1083 break;
1084
1085 AccumulatedOffset += CI;
1086 Addr = stripAssertAlign(N: Addr->getOperand(Num: 0));
1087 }
1088 return DAG->getSignedTargetConstant(Val: AccumulatedOffset.getSExtValue(), DL,
1089 VT: MVT::i32);
1090}
1091
1092static std::pair<SDValue, SDValue> selectADDR(SDValue Addr, SelectionDAG *DAG) {
1093 SDValue Offset = accumulateOffset(Addr, DL: SDLoc(Addr), DAG);
1094 SDValue Base = selectBaseADDR(N: Addr, DAG);
1095 return {Base, Offset};
1096}
1097
1098// Select a pair of operands which represent a valid PTX address, this could be
1099// one of the following things:
1100// - [var] - Offset is simply set to 0
1101// - [reg] - Offset is simply set to 0
1102// - [reg+immOff]
1103// - [var+immOff]
1104// Note that immOff must fit into a 32-bit signed integer.
1105bool NVPTXDAGToDAGISel::SelectADDR(SDValue Addr, SDValue &Base,
1106 SDValue &Offset) {
1107 std::tie(args&: Base, args&: Offset) = selectADDR(Addr, DAG: CurDAG);
1108 return true;
1109}
1110
// Select a scalar load (plain LOAD, ATOMIC_LOAD, or masked MLoad) into an
// NVPTX LD machine node, inserting any fence required by its memory ordering.
// Returns false when the result type has no matching LD opcode so selection
// can fall through to other patterns.
bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
  MemSDNode *LD = cast<MemSDNode>(Val: N);
  assert(LD->readMem() && "Expected load");

  // do not support pre/post inc/dec
  const LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(Val: LD);
  if (PlainLoad && PlainLoad->isIndexed())
    return false;

  // Address Space Setting
  const auto CodeAddrSpace = getAddrSpace(N: LD);
  // Loads eligible for ld.global.nc are handled by the dedicated LDG path.
  if (canLowerToLDG(N: *LD, Subtarget: *Subtarget, CodeAddrSpace))
    return tryLDG(N: LD);

  SDLoc DL(LD);
  SDValue Chain = N->getOperand(Num: 0);
  // May emit a fence node before the load; Chain is updated accordingly.
  const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, N: LD);

  const unsigned FromTypeWidth = LD->getMemoryVT().getSizeInBits();

  // Vector Setting
  // Sign-extending loads use the signed variant; all other loads are untyped
  // (bit-preserving).
  const unsigned FromType =
      (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
          ? NVPTX::PTXLdStInstCode::Signed
          : NVPTX::PTXLdStInstCode::Untyped;

  // MLoad nodes carry a mask (operand 3) of the bytes actually consumed;
  // ordinary loads use every byte.
  uint32_t UsedBytesMask;
  switch (N->getOpcode()) {
  case ISD::LOAD:
  case ISD::ATOMIC_LOAD:
    UsedBytesMask = UINT32_MAX;
    break;
  case NVPTXISD::MLoad:
    UsedBytesMask = N->getConstantOperandVal(Num: 3);
    break;
  default:
    llvm_unreachable("Unexpected opcode");
  }

  assert(isPowerOf2_32(FromTypeWidth) && FromTypeWidth >= 8 &&
         FromTypeWidth <= 128 && "Invalid width for load");

  // Create the machine instruction DAG
  const auto [Base, Offset] = selectADDR(Addr: N->getOperand(Num: 1), DAG: CurDAG);
  SDValue Ops[] = {getI32Imm(Imm: Ordering, DL),
                   getI32Imm(Imm: Scope, DL),
                   getI32Imm(Imm: CodeAddrSpace, DL),
                   getI32Imm(Imm: FromType, DL),
                   getI32Imm(Imm: FromTypeWidth, DL),
                   getI32Imm(Imm: UsedBytesMask, DL),
                   Base,
                   Offset,
                   Chain};

  const MVT::SimpleValueType TargetVT = LD->getSimpleValueType(ResNo: 0).SimpleTy;
  const std::optional<unsigned> Opcode =
      pickOpcodeForVT(VT: TargetVT, Opcode_i16: NVPTX::LD_i16, Opcode_i32: NVPTX::LD_i32, Opcode_i64: NVPTX::LD_i64);
  if (!Opcode)
    return false;

  SDNode *NVPTXLD = CurDAG->getMachineNode(Opcode: *Opcode, dl: DL, VTs: LD->getVTList(), Ops);
  if (!NVPTXLD)
    return false;

  // Preserve the memory operand so later passes keep the aliasing info.
  MachineMemOperand *MemRef = LD->getMemOperand();
  CurDAG->setNodeMemRefs(N: cast<MachineSDNode>(Val: NVPTXLD), NewMemRefs: {MemRef});

  ReplaceNode(F: LD, T: NVPTXLD);
  return true;
}
1181
1182static unsigned getStoreVectorNumElts(SDNode *N) {
1183 switch (N->getOpcode()) {
1184 case NVPTXISD::StoreV2:
1185 return 2;
1186 case NVPTXISD::StoreV4:
1187 return 4;
1188 case NVPTXISD::StoreV8:
1189 return 8;
1190 default:
1191 llvm_unreachable("Unexpected opcode");
1192 }
1193}
1194
// Select a NVPTXISD::LoadV2/V4/V8 vector load into an LDV machine node,
// inserting any fence required by its memory ordering. Returns false when no
// opcode exists for the element type.
bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
  MemSDNode *LD = cast<MemSDNode>(Val: N);

  // Address Space Setting
  const auto CodeAddrSpace = getAddrSpace(N: LD);
  // Loads eligible for ld.global.nc are handled by the dedicated LDG path.
  if (canLowerToLDG(N: *LD, Subtarget: *Subtarget, CodeAddrSpace))
    return tryLDG(N: LD);

  const MVT EltVT = LD->getSimpleValueType(ResNo: 0);
  SDLoc DL(LD);
  SDValue Chain = LD->getChain();
  // May emit a fence node before the load; Chain is updated accordingly.
  const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, N: LD);

  // Type Setting: fromType + fromTypeWidth
  //
  // Sign : ISD::SEXTLOAD
  // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
  // type is integer
  // Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
  // Read at least 8 bits (predicates are stored as 8-bit values)
  // Get the original LoadSDNode::getExtensionType() value
  const unsigned ExtensionType = N->getConstantOperandVal(Num: 4);
  const unsigned FromType = (ExtensionType == ISD::SEXTLOAD)
                                ? NVPTX::PTXLdStInstCode::Signed
                                : NVPTX::PTXLdStInstCode::Untyped;

  const unsigned FromTypeWidth = getFromTypeWidthForLoad(Mem: LD);
  // Operand 3 carries the mask of bytes actually consumed by this load.
  const uint32_t UsedBytesMask = N->getConstantOperandVal(Num: 3);

  // Extending loads of vector element types are not supported here.
  assert(!(EltVT.isVector() && ExtensionType != ISD::NON_EXTLOAD));

  const auto [Base, Offset] = selectADDR(Addr: N->getOperand(Num: 1), DAG: CurDAG);
  SDValue Ops[] = {getI32Imm(Imm: Ordering, DL),
                   getI32Imm(Imm: Scope, DL),
                   getI32Imm(Imm: CodeAddrSpace, DL),
                   getI32Imm(Imm: FromType, DL),
                   getI32Imm(Imm: FromTypeWidth, DL),
                   getI32Imm(Imm: UsedBytesMask, DL),
                   Base,
                   Offset,
                   Chain};

  // Pick the LDV variant matching the vector arity and element width.
  std::optional<unsigned> Opcode;
  switch (N->getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode");
  case NVPTXISD::LoadV2:
    Opcode = pickOpcodeForVT(VT: EltVT.SimpleTy, Opcode_i16: NVPTX::LDV_i16_v2,
                             Opcode_i32: NVPTX::LDV_i32_v2, Opcode_i64: NVPTX::LDV_i64_v2);
    break;
  case NVPTXISD::LoadV4:
    Opcode = pickOpcodeForVT(VT: EltVT.SimpleTy, Opcode_i16: NVPTX::LDV_i16_v4,
                             Opcode_i32: NVPTX::LDV_i32_v4, Opcode_i64: NVPTX::LDV_i64_v4);
    break;
  case NVPTXISD::LoadV8:
    Opcode = pickOpcodeForVT(VT: EltVT.SimpleTy, Opcode_i16: {/* no v8i16 */},
                             Opcode_i32: NVPTX::LDV_i32_v8, Opcode_i64: {/* no v8i64 */});
    break;
  }
  if (!Opcode)
    return false;

  SDNode *NVPTXLD = CurDAG->getMachineNode(Opcode: *Opcode, dl: DL, VTs: LD->getVTList(), Ops);

  // Preserve the memory operand so later passes keep the aliasing info.
  MachineMemOperand *MemRef = LD->getMemOperand();
  CurDAG->setNodeMemRefs(N: cast<MachineSDNode>(Val: NVPTXLD), NewMemRefs: {MemRef});

  ReplaceNode(F: LD, T: NVPTXLD);
  return true;
}
1265
// Select a load as an LD_GLOBAL_NC ("ldg") instruction. Handles plain loads,
// masked MLoads, and the LoadV2/V4/V8 vector forms; returns false when the
// result type has no matching LD_GLOBAL_NC opcode.
bool NVPTXDAGToDAGISel::tryLDG(MemSDNode *LD) {
  SDLoc DL(LD);

  unsigned ExtensionType;
  uint32_t UsedBytesMask;
  if (const auto *Load = dyn_cast<LoadSDNode>(Val: LD)) {
    // Plain LoadSDNode: extension kind is a node property; all bytes used.
    ExtensionType = Load->getExtensionType();
    UsedBytesMask = UINT32_MAX;
  } else {
    // NVPTXISD custom nodes carry the extension type and used-bytes mask as
    // constant operands 4 and 3, respectively.
    ExtensionType = LD->getConstantOperandVal(Num: 4);
    UsedBytesMask = LD->getConstantOperandVal(Num: 3);
  }
  // Sign-extending loads use the signed variant; everything else is untyped.
  const unsigned FromType = (ExtensionType == ISD::SEXTLOAD)
                                ? NVPTX::PTXLdStInstCode::Signed
                                : NVPTX::PTXLdStInstCode::Untyped;

  const unsigned FromTypeWidth = getFromTypeWidthForLoad(Mem: LD);

  // Extending loads of vector element types are not supported here.
  assert(!(LD->getSimpleValueType(0).isVector() &&
           ExtensionType != ISD::NON_EXTLOAD));

  // Note: no ordering/scope/addrspace operands here — LDG is always a global,
  // non-coherent load, so the operand list is shorter than for plain LD.
  const auto [Base, Offset] = selectADDR(Addr: LD->getOperand(Num: 1), DAG: CurDAG);
  SDValue Ops[] = {getI32Imm(Imm: FromType, DL),
                   getI32Imm(Imm: FromTypeWidth, DL),
                   getI32Imm(Imm: UsedBytesMask, DL),
                   Base,
                   Offset,
                   LD->getChain()};

  const MVT::SimpleValueType TargetVT = LD->getSimpleValueType(ResNo: 0).SimpleTy;
  std::optional<unsigned> Opcode;
  switch (LD->getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode");
  case ISD::LOAD:
    Opcode = pickOpcodeForVT(VT: TargetVT, Opcode_i16: NVPTX::LD_GLOBAL_NC_i16,
                             Opcode_i32: NVPTX::LD_GLOBAL_NC_i32, Opcode_i64: NVPTX::LD_GLOBAL_NC_i64);
    break;
  case NVPTXISD::MLoad:
    // Masked loads have no 16-bit variant.
    Opcode = pickOpcodeForVT(VT: TargetVT, Opcode_i16: std::nullopt, Opcode_i32: NVPTX::LD_GLOBAL_NC_i32,
                             Opcode_i64: NVPTX::LD_GLOBAL_NC_i64);
    break;
  case NVPTXISD::LoadV2:
    Opcode =
        pickOpcodeForVT(VT: TargetVT, Opcode_i16: NVPTX::LD_GLOBAL_NC_v2i16,
                        Opcode_i32: NVPTX::LD_GLOBAL_NC_v2i32, Opcode_i64: NVPTX::LD_GLOBAL_NC_v2i64);
    break;
  case NVPTXISD::LoadV4:
    Opcode =
        pickOpcodeForVT(VT: TargetVT, Opcode_i16: NVPTX::LD_GLOBAL_NC_v4i16,
                        Opcode_i32: NVPTX::LD_GLOBAL_NC_v4i32, Opcode_i64: NVPTX::LD_GLOBAL_NC_v4i64);
    break;
  case NVPTXISD::LoadV8:
    Opcode = pickOpcodeForVT(VT: TargetVT, Opcode_i16: {/* no v8i16 */},
                             Opcode_i32: NVPTX::LD_GLOBAL_NC_v8i32, Opcode_i64: {/* no v8i64 */});
    break;
  }
  if (!Opcode)
    return false;

  SDNode *NVPTXLDG = CurDAG->getMachineNode(Opcode: *Opcode, dl: DL, VTs: LD->getVTList(), Ops);

  ReplaceNode(F: LD, T: NVPTXLDG);
  return true;
}
1331
1332unsigned NVPTXDAGToDAGISel::getFromTypeWidthForLoad(const MemSDNode *Mem) {
1333 auto TotalWidth = Mem->getMemoryVT().getSizeInBits();
1334 auto NumElts = Mem->getNumValues() - 1;
1335 auto ElementBitWidth = TotalWidth / NumElts;
1336 assert(isPowerOf2_32(ElementBitWidth) && ElementBitWidth >= 8 &&
1337 ElementBitWidth <= 128 && TotalWidth <= 256 &&
1338 "Invalid width for load");
1339 return ElementBitWidth;
1340}
1341
// Select an LDU ("load uniform") operation, coming either from the ldu
// intrinsic (INTRINSIC_W_CHAIN) or from the custom LDUV2/LDUV4 nodes produced
// by vector handling. Returns false when the result type has no LDU opcode.
bool NVPTXDAGToDAGISel::tryLDU(SDNode *N) {
  auto *LD = cast<MemSDNode>(Val: N);

  SDLoc DL(N);
  const unsigned FromTypeWidth = getFromTypeWidthForLoad(Mem: LD);
  const MVT::SimpleValueType TargetVT = LD->getSimpleValueType(ResNo: 0).SimpleTy;

  // If this is an LDU intrinsic, the address is the third operand. If its an
  // LDU SD node (from custom vector handling), then its the second operand
  SDValue Addr =
      LD->getOperand(Num: LD->getOpcode() == ISD::INTRINSIC_W_CHAIN ? 2 : 1);

  const auto [Base, Offset] = selectADDR(Addr, DAG: CurDAG);
  SDValue Ops[] = {getI32Imm(Imm: FromTypeWidth, DL), Base, Offset, LD->getChain()};

  // Pick the LDU variant matching the arity and element width.
  std::optional<unsigned> Opcode;
  switch (N->getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode");
  case ISD::INTRINSIC_W_CHAIN:
    Opcode = pickOpcodeForVT(VT: TargetVT, Opcode_i16: NVPTX::LDU_GLOBAL_i16,
                             Opcode_i32: NVPTX::LDU_GLOBAL_i32, Opcode_i64: NVPTX::LDU_GLOBAL_i64);
    break;
  case NVPTXISD::LDUV2:
    Opcode = pickOpcodeForVT(VT: TargetVT, Opcode_i16: NVPTX::LDU_GLOBAL_v2i16,
                             Opcode_i32: NVPTX::LDU_GLOBAL_v2i32, Opcode_i64: NVPTX::LDU_GLOBAL_v2i64);
    break;
  case NVPTXISD::LDUV4:
    Opcode = pickOpcodeForVT(VT: TargetVT, Opcode_i16: NVPTX::LDU_GLOBAL_v4i16,
                             Opcode_i32: NVPTX::LDU_GLOBAL_v4i32, Opcode_i64: {/* no v4i64 */});
    break;
  }
  if (!Opcode)
    return false;

  SDNode *NVPTXLDU = CurDAG->getMachineNode(Opcode: *Opcode, dl: DL, VTs: LD->getVTList(), Ops);

  ReplaceNode(F: LD, T: NVPTXLDU);
  return true;
}
1382
// Select a scalar store (plain or atomic) into an NVPTX ST machine node,
// inserting any fence required by its memory ordering. Returns false for
// indexed stores or value types with no matching ST opcode.
bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
  MemSDNode *ST = cast<MemSDNode>(Val: N);
  assert(ST->writeMem() && "Expected store");
  // Exactly one of these casts succeeds, depending on the node kind.
  StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(Val: ST);
  AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(Val: ST);
  assert((PlainStore || AtomicStore) && "Expected store");

  // do not support pre/post inc/dec
  if (PlainStore && PlainStore->isIndexed())
    return false;

  // Address Space Setting
  const auto CodeAddrSpace = getAddrSpace(N: ST);

  SDLoc DL(ST);
  SDValue Chain = ST->getChain();
  // May emit a fence node before the store; Chain is updated accordingly.
  const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, N: ST);

  // Vector Setting
  const unsigned ToTypeWidth = ST->getMemoryVT().getSizeInBits();

  // Create the machine instruction DAG
  SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();

  assert(isPowerOf2_32(ToTypeWidth) && ToTypeWidth >= 8 && ToTypeWidth <= 128 &&
         "Invalid width for store");

  const auto [Base, Offset] = selectADDR(Addr: ST->getBasePtr(), DAG: CurDAG);
  // The stored value may be folded into an immediate operand.
  SDValue Ops[] = {selectPossiblyImm(V: Value),
                   getI32Imm(Imm: Ordering, DL),
                   getI32Imm(Imm: Scope, DL),
                   getI32Imm(Imm: CodeAddrSpace, DL),
                   getI32Imm(Imm: ToTypeWidth, DL),
                   Base,
                   Offset,
                   Chain};

  const std::optional<unsigned> Opcode =
      pickOpcodeForVT(VT: Value.getSimpleValueType().SimpleTy, Opcode_i16: NVPTX::ST_i16,
                      Opcode_i32: NVPTX::ST_i32, Opcode_i64: NVPTX::ST_i64);
  if (!Opcode)
    return false;

  SDNode *NVPTXST = CurDAG->getMachineNode(Opcode: *Opcode, dl: DL, VT: MVT::Other, Ops);

  if (!NVPTXST)
    return false;

  // Preserve the memory operand so later passes keep the aliasing info.
  MachineMemOperand *MemRef = ST->getMemOperand();
  CurDAG->setNodeMemRefs(N: cast<MachineSDNode>(Val: NVPTXST), NewMemRefs: {MemRef});
  ReplaceNode(F: ST, T: NVPTXST);
  return true;
}
1436
// Select a NVPTXISD::StoreV2/V4/V8 vector store into an STV machine node,
// inserting any fence required by its memory ordering. Returns false when no
// opcode exists for the element type.
bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
  MemSDNode *ST = cast<MemSDNode>(Val: N);
  const unsigned TotalWidth = ST->getMemoryVT().getSizeInBits();

  // Address Space Setting
  const auto CodeAddrSpace = getAddrSpace(N: ST);
  if (CodeAddrSpace == NVPTX::AddressSpace::Const) {
    report_fatal_error(reason: "Cannot store to pointer that points to constant "
                       "memory space");
  }

  SDLoc DL(ST);
  SDValue Chain = ST->getChain();
  // May emit a fence node before the store; Chain is updated accordingly.
  const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, N: ST);

  const unsigned NumElts = getStoreVectorNumElts(N: ST);

  // Operands 1..NumElts are the stored values; each may fold to an immediate.
  SmallVector<SDValue, 16> Ops;
  for (auto &V : ST->ops().slice(N: 1, M: NumElts))
    Ops.push_back(Elt: selectPossiblyImm(V));
  // The address operand follows the stored values.
  SDValue Addr = N->getOperand(Num: NumElts + 1);
  const unsigned ToTypeWidth = TotalWidth / NumElts;

  assert(isPowerOf2_32(ToTypeWidth) && ToTypeWidth >= 8 && ToTypeWidth <= 128 &&
         TotalWidth <= 256 && "Invalid width for store");

  const auto [Base, Offset] = selectADDR(Addr, DAG: CurDAG);
  Ops.append(IL: {getI32Imm(Imm: Ordering, DL), getI32Imm(Imm: Scope, DL),
              getI32Imm(Imm: CodeAddrSpace, DL), getI32Imm(Imm: ToTypeWidth, DL), Base,
              Offset, Chain});

  const MVT::SimpleValueType EltVT =
      ST->getOperand(Num: 1).getSimpleValueType().SimpleTy;
  // Pick the STV variant matching the vector arity and element width.
  std::optional<unsigned> Opcode;
  switch (ST->getOpcode()) {
  default:
    return false;
  case NVPTXISD::StoreV2:
    Opcode = pickOpcodeForVT(VT: EltVT, Opcode_i16: NVPTX::STV_i16_v2, Opcode_i32: NVPTX::STV_i32_v2,
                             Opcode_i64: NVPTX::STV_i64_v2);
    break;
  case NVPTXISD::StoreV4:
    Opcode = pickOpcodeForVT(VT: EltVT, Opcode_i16: NVPTX::STV_i16_v4, Opcode_i32: NVPTX::STV_i32_v4,
                             Opcode_i64: NVPTX::STV_i64_v4);
    break;
  case NVPTXISD::StoreV8:
    Opcode = pickOpcodeForVT(VT: EltVT, Opcode_i16: {/* no v8i16 */}, Opcode_i32: NVPTX::STV_i32_v8,
                             Opcode_i64: {/* no v8i64 */});
    break;
  }

  if (!Opcode)
    return false;

  SDNode *NVPTXST = CurDAG->getMachineNode(Opcode: *Opcode, dl: DL, VT: MVT::Other, Ops);

  // Preserve the memory operand so later passes keep the aliasing info.
  MachineMemOperand *MemRef = ST->getMemOperand();
  CurDAG->setNodeMemRefs(N: cast<MachineSDNode>(Val: NVPTXST), NewMemRefs: {MemRef});

  ReplaceNode(F: ST, T: NVPTXST);
  return true;
}
1499
/// SelectBFE - Look for instruction sequences that can be made more efficient
/// by using the 'bfe' (bit-field extract) PTX instruction
///
/// Recognized patterns:
///   and(srl/sra(val, start), mask)      -> bfe val, start, len
///   srl/sra(and(val, mask), shift)      -> bfe val, shift, len
///   srl/sra(shl(val, inner), outer)     -> bfe val, outer-inner, size-outer
/// Returns false (leaving N untouched) whenever the transform would not be
/// profitable or the operands are not constants.
bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(Num: 0);
  SDValue RHS = N->getOperand(Num: 1);
  // Parameters of the bfe being built: extract Len bits of Val starting at
  // bit Start, sign-extending the result when IsSigned is set.
  SDValue Len;
  SDValue Start;
  SDValue Val;
  bool IsSigned = false;

  if (N->getOpcode() == ISD::AND) {
    // Canonicalize the operands
    // We want 'and %val, %mask'
    if (isa<ConstantSDNode>(Val: LHS) && !isa<ConstantSDNode>(Val: RHS)) {
      std::swap(a&: LHS, b&: RHS);
    }

    ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Val&: RHS);
    if (!Mask) {
      // We need a constant mask on the RHS of the AND
      return false;
    }

    // Extract the mask bits
    uint64_t MaskVal = Mask->getZExtValue();
    if (!isMask_64(Value: MaskVal)) {
      // We *could* handle shifted masks here, but doing so would require an
      // 'and' operation to fix up the low-order bits so we would trade
      // shr+and for bfe+and, which has the same throughput
      return false;
    }

    // How many bits are in our mask?
    int64_t NumBits = countr_one(Value: MaskVal);
    Len = CurDAG->getTargetConstant(Val: NumBits, DL, VT: MVT::i32);

    if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {
      // We have a 'srl/and' pair, extract the effective start bit and length
      Val = LHS.getNode()->getOperand(Num: 0);
      Start = LHS.getNode()->getOperand(Num: 1);
      ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Val&: Start);
      if (StartConst) {
        uint64_t StartVal = StartConst->getZExtValue();
        // How many "good" bits do we have left? "good" is defined here as bits
        // that exist in the original value, not shifted in.
        int64_t GoodBits = Start.getValueSizeInBits() - StartVal;
        if (NumBits > GoodBits) {
          // Do not handle the case where bits have been shifted in. In theory
          // we could handle this, but the cost is likely higher than just
          // emitting the srl/and pair.
          return false;
        }
        Start = CurDAG->getTargetConstant(Val: StartVal, DL, VT: MVT::i32);
      } else {
        // Do not handle the case where the shift amount (can be zero if no srl
        // was found) is not constant. We could handle this case, but it would
        // require run-time logic that would be more expensive than just
        // emitting the srl/and pair.
        return false;
      }
    } else {
      // Do not handle the case where the LHS of the and is not a shift. While
      // it would be trivial to handle this case, it would just transform
      // 'and' -> 'bfe', but 'and' has higher-throughput.
      return false;
    }
  } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
    if (LHS->getOpcode() == ISD::AND) {
      ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(Val&: RHS);
      if (!ShiftCnst) {
        // Shift amount must be constant
        return false;
      }

      uint64_t ShiftAmt = ShiftCnst->getZExtValue();

      SDValue AndLHS = LHS->getOperand(Num: 0);
      SDValue AndRHS = LHS->getOperand(Num: 1);

      // Canonicalize the AND to have the mask on the RHS
      if (isa<ConstantSDNode>(Val: AndLHS)) {
        std::swap(a&: AndLHS, b&: AndRHS);
      }

      ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Val&: AndRHS);
      if (!MaskCnst) {
        // Mask must be constant
        return false;
      }

      uint64_t MaskVal = MaskCnst->getZExtValue();
      uint64_t NumZeros;
      uint64_t NumBits;
      if (isMask_64(Value: MaskVal)) {
        NumZeros = 0;
        // The number of bits in the result bitfield will be the number of
        // trailing ones (the AND) minus the number of bits we shift off
        NumBits = llvm::countr_one(Value: MaskVal) - ShiftAmt;
      } else if (isShiftedMask_64(Value: MaskVal)) {
        NumZeros = llvm::countr_zero(Val: MaskVal);
        unsigned NumOnes = llvm::countr_one(Value: MaskVal >> NumZeros);
        // The number of bits in the result bitfield will be the number of
        // trailing zeros plus the number of set bits in the mask minus the
        // number of bits we shift off
        NumBits = NumZeros + NumOnes - ShiftAmt;
      } else {
        // This is not a mask we can handle
        return false;
      }

      if (ShiftAmt < NumZeros) {
        // Handling this case would require extra logic that would make this
        // transformation non-profitable
        return false;
      }

      Val = AndLHS;
      Start = CurDAG->getTargetConstant(Val: ShiftAmt, DL, VT: MVT::i32);
      Len = CurDAG->getTargetConstant(Val: NumBits, DL, VT: MVT::i32);

      // If pre-shift AND includes the sign bit in the bitfield, we must use
      // signed BFE to replicate that bit during bitfield extraction. If the
      // sign bit is not part of the mask, unsigned BFE will zero out upper bits
      // of the result
      if (N->getOpcode() == ISD::SRA)
        IsSigned = (ShiftAmt + NumBits) == Val.getValueSizeInBits();
    } else if (LHS->getOpcode() == ISD::SHL) {
      // Here, we have a pattern like:
      //
      // (sra (shl val, NN), MM)
      // or
      // (srl (shl val, NN), MM)
      //
      // If MM >= NN, we can efficiently optimize this with bfe
      Val = LHS->getOperand(Num: 0);

      SDValue ShlRHS = LHS->getOperand(Num: 1);
      ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(Val&: ShlRHS);
      if (!ShlCnst) {
        // Shift amount must be constant
        return false;
      }
      uint64_t InnerShiftAmt = ShlCnst->getZExtValue();

      SDValue ShrRHS = RHS;
      ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(Val&: ShrRHS);
      if (!ShrCnst) {
        // Shift amount must be constant
        return false;
      }
      uint64_t OuterShiftAmt = ShrCnst->getZExtValue();

      // To avoid extra codegen and be profitable, we need Outer >= Inner
      if (OuterShiftAmt < InnerShiftAmt) {
        return false;
      }

      // If the outer shift is more than the type size, we have no bitfield to
      // extract (since we also check that the inner shift is <= the outer shift
      // then this also implies that the inner shift is < the type size)
      if (OuterShiftAmt >= Val.getValueSizeInBits()) {
        return false;
      }

      Start = CurDAG->getTargetConstant(Val: OuterShiftAmt - InnerShiftAmt, DL,
                                        VT: MVT::i32);
      Len = CurDAG->getTargetConstant(Val: Val.getValueSizeInBits() - OuterShiftAmt,
                                      DL, VT: MVT::i32);

      if (N->getOpcode() == ISD::SRA) {
        // If we have a arithmetic right shift, we need to use the signed bfe
        // variant
        IsSigned = true;
      }
    } else {
      // No can do...
      return false;
    }
  } else {
    // No can do...
    return false;
  }


  // Pick the signed/unsigned, 32/64-bit register-immediate-immediate variant.
  unsigned Opc;
  // For the BFE operations we form here from "and" and "srl", always use the
  // unsigned variants.
  if (Val.getValueType() == MVT::i32) {
    if (IsSigned) {
      Opc = NVPTX::BFE_S32rii;
    } else {
      Opc = NVPTX::BFE_U32rii;
    }
  } else if (Val.getValueType() == MVT::i64) {
    if (IsSigned) {
      Opc = NVPTX::BFE_S64rii;
    } else {
      Opc = NVPTX::BFE_U64rii;
    }
  } else {
    // We cannot handle this type
    return false;
  }

  SDValue Ops[] = {
    Val, Start, Len
  };

  ReplaceNode(F: N, T: CurDAG->getMachineNode(Opcode: Opc, dl: DL, VTs: N->getVTList(), Ops));
  return true;
}
1712
// Select bf16/bf16v2 FADD, FSUB, FMUL as fma on targets with only fma
//
// Uses the FMA identities:
//   add(a, b) -> fma(a, 1.0, b)
//   sub(a, b) -> fma(b, -1.0, a)
//   mul(a, b) -> fma(a, b, -0.0)
// Returns false (no change) for non-bf16 types or when the target has native
// bf16 support for the opcode.
bool NVPTXDAGToDAGISel::tryBF16ArithToFMA(SDNode *N) {
  EVT VT = SDValue(N, 0).getValueType();
  if (VT.getScalarType() != MVT::bf16)
    return false;

  const NVPTXSubtarget *STI = TM.getSubtargetImpl();
  // Native support exists: let normal selection handle it.
  if (STI->hasNativeBF16Support(Opcode: N->getOpcode()))
    return false;

  const bool IsVec = VT.isVector();
  assert(!IsVec || VT.getVectorNumElements() == 2);
  SDLoc DL(N);
  SDValue N0 = N->getOperand(Num: 0);
  SDValue N1 = N->getOperand(Num: 1);
  SmallVector<SDValue, 3> Operands;
  // Materialize a bf16 (or splatted bf16x2) constant as a register move.
  auto GetConstant = [&](float Value) -> SDValue {
    // BF16 immediates must be legalized to integer register values
    APFloat APF(Value);
    bool LosesInfo;
    APF.convert(ToSemantics: APFloat::BFloat(), RM: APFloat::rmNearestTiesToEven, losesInfo: &LosesInfo);
    assert(!LosesInfo);
    if (IsVec) {
      // Duplicate the 16-bit pattern into both halves of a 32-bit register.
      auto API = APF.bitcastToAPInt();
      API = API.concat(NewLSB: API);
      auto Const = CurDAG->getTargetConstant(Val: API, DL, VT: MVT::i32);
      return SDValue(CurDAG->getMachineNode(Opcode: NVPTX::MOV_B32_i, dl: DL, VT, Op1: Const),
                     0);
    }
    auto Const = CurDAG->getTargetConstantFP(Val: APF, DL, VT);
    return SDValue(CurDAG->getMachineNode(Opcode: NVPTX::MOV_BF16_i, dl: DL, VT, Op1: Const), 0);
  };

  switch (N->getOpcode()) {
  case ISD::FADD:
    // add(a, b) -> fma(a, 1.0, b)
    Operands = {N0, GetConstant(1.0), N1};
    break;
  case ISD::FSUB:
    // sub(a, b) -> fma(b, -1.0, a)
    Operands = {N1, GetConstant(-1.0), N0};
    break;
  case ISD::FMUL:
    // mul(a, b) -> fma(a, b, -0.0)
    // NOTE: The identity is -0, not 0, because -0 + 0 == 0 for floats
    Operands = {N0, N1, GetConstant(-0.0)};
    break;
  default:
    llvm_unreachable("Unexpected opcode");
  };

  int Opcode = IsVec ? NVPTX::FMA_BF16x2rrr : NVPTX::FMA_BF16rrr;
  MachineSDNode *FMA = CurDAG->getMachineNode(Opcode, dl: DL, VT, Ops: Operands);
  ReplaceNode(F: N, T: FMA);
  return true;
}
1769
1770SDValue NVPTXDAGToDAGISel::selectPossiblyImm(SDValue V) {
1771 if (V.getOpcode() == ISD::BITCAST)
1772 V = V.getOperand(i: 0);
1773
1774 if (auto *CN = dyn_cast<ConstantSDNode>(Val&: V))
1775 return CurDAG->getTargetConstant(Val: CN->getAPIntValue(), DL: SDLoc(V),
1776 VT: V.getValueType());
1777 if (auto *CN = dyn_cast<ConstantFPSDNode>(Val&: V))
1778 return CurDAG->getTargetConstantFP(Val: CN->getValueAPF(), DL: SDLoc(V),
1779 VT: V.getValueType());
1780 return V;
1781}
1782
1783/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
1784/// inline asm expressions.
1785bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
1786 const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
1787 std::vector<SDValue> &OutOps) {
1788 switch (ConstraintID) {
1789 default:
1790 return true;
1791 case InlineAsm::ConstraintCode::m: { // memory
1792 const auto [Base, Offset] = selectADDR(Addr: Op, DAG: CurDAG);
1793 OutOps.push_back(x: Base);
1794 OutOps.push_back(x: Offset);
1795 return false;
1796 }
1797 }
1798 return true;
1799}
1800
1801void NVPTXDAGToDAGISel::SelectV2I64toI128(SDNode *N) {
1802 // Lower a CopyToReg with two 64-bit inputs
1803 // Dst:i128, lo:i64, hi:i64
1804 //
1805 // CopyToReg Dst, lo, hi;
1806 //
1807 // ==>
1808 //
1809 // tmp = V2I64toI128 {lo, hi};
1810 // CopyToReg Dst, tmp;
1811 SDValue Dst = N->getOperand(Num: 1);
1812 SDValue Lo = N->getOperand(Num: 2);
1813 SDValue Hi = N->getOperand(Num: 3);
1814
1815 SDLoc DL(N);
1816 SDNode *Mov =
1817 CurDAG->getMachineNode(Opcode: NVPTX::V2I64toI128, dl: DL, VT: MVT::i128, Ops: {Lo, Hi});
1818
1819 SmallVector<SDValue, 4> NewOps(N->getNumOperands() - 1);
1820 NewOps[0] = N->getOperand(Num: 0);
1821 NewOps[1] = Dst;
1822 NewOps[2] = SDValue(Mov, 0);
1823 if (N->getNumOperands() == 5)
1824 NewOps[3] = N->getOperand(Num: 4);
1825 SDValue NewValue = CurDAG->getNode(Opcode: ISD::CopyToReg, DL, ResultTys: SmallVector<EVT>(N->values()), Ops: NewOps);
1826
1827 ReplaceNode(F: N, T: NewValue.getNode());
1828}
1829
1830void NVPTXDAGToDAGISel::SelectI128toV2I64(SDNode *N) {
1831 // Lower CopyFromReg from a 128-bit regs to two 64-bit regs
1832 // Dst:i128, Src:i128
1833 //
1834 // {lo, hi} = CopyFromReg Src
1835 //
1836 // ==>
1837 //
1838 // {lo, hi} = I128toV2I64 Src
1839 //
1840 SDValue Ch = N->getOperand(Num: 0);
1841 SDValue Src = N->getOperand(Num: 1);
1842 SDValue Glue = N->getOperand(Num: 2);
1843 SDLoc DL(N);
1844
1845 // Add Glue and Ch to the operands and results to avoid break the execution
1846 // order
1847 SDNode *Mov = CurDAG->getMachineNode(
1848 Opcode: NVPTX::I128toV2I64, dl: DL,
1849 ResultTys: {MVT::i64, MVT::i64, Ch.getValueType(), Glue.getValueType()},
1850 Ops: {Src, Ch, Glue});
1851
1852 ReplaceNode(F: N, T: Mov);
1853}
1854
1855bool NVPTXDAGToDAGISel::tryFence(SDNode *N) {
1856 SDLoc DL(N);
1857 assert(N->getOpcode() == ISD::ATOMIC_FENCE);
1858 auto Scope = Scopes[N->getConstantOperandVal(Num: 2)];
1859
1860 // Singlethread fences have no inter-thread synchronization requirements.
1861 // Note: std::atomic_signal_fence lowers to singlethread LLVM IR fences;
1862 // this intentionally drops these before emitting PTX.
1863 if (Scope == NVPTX::Scope::Thread) {
1864 CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: N->getOperand(Num: 0));
1865 CurDAG->RemoveDeadNode(N);
1866 return true;
1867 }
1868
1869 unsigned int FenceOp = getFenceOp(
1870 O: NVPTX::Ordering(N->getConstantOperandVal(Num: 1)), S: Scope, T: Subtarget);
1871 SDValue Chain = N->getOperand(Num: 0);
1872 SDNode *FenceNode = CurDAG->getMachineNode(Opcode: FenceOp, dl: DL, VT: MVT::Other, Op1: Chain);
1873 ReplaceNode(F: N, T: FenceNode);
1874 return true;
1875}
1876
1877NVPTXScopes::NVPTXScopes(LLVMContext &C) : Context(&C) {
1878 Scopes[C.getOrInsertSyncScopeID(SSN: "singlethread")] = NVPTX::Scope::Thread;
1879 Scopes[C.getOrInsertSyncScopeID(SSN: "")] = NVPTX::Scope::System;
1880 Scopes[C.getOrInsertSyncScopeID(SSN: "block")] = NVPTX::Scope::Block;
1881 Scopes[C.getOrInsertSyncScopeID(SSN: "cluster")] = NVPTX::Scope::Cluster;
1882 Scopes[C.getOrInsertSyncScopeID(SSN: "device")] = NVPTX::Scope::Device;
1883}
1884
1885NVPTX::Scope NVPTXScopes::operator[](SyncScope::ID ID) const {
1886 if (Scopes.empty())
1887 llvm_unreachable("NVPTX Scopes must be initialized before calling "
1888 "NVPTXScopes::operator[]");
1889
1890 auto S = Scopes.find(Key: ID);
1891 if (S == Scopes.end()) {
1892 auto scopeName = Context->getSyncScopeName(Id: ID);
1893 assert(scopeName.has_value() && "Scope name must exist.");
1894
1895 // Build list of supported syncscopes programmatically
1896 SmallVector<StringRef> supportedScopes;
1897 for (const auto &Entry : Scopes) {
1898 if (auto name = Context->getSyncScopeName(Id: Entry.first))
1899 supportedScopes.push_back(Elt: name->empty() ? "<empty string>" : *name);
1900 }
1901
1902 reportFatalUsageError(
1903 reason: formatv(Fmt: "NVPTX backend does not support syncscope \"{0}\" (ID={1}).\n"
1904 "Supported syncscopes are: {2}.",
1905 Vals&: scopeName.value(), Vals: int(ID),
1906 Vals: make_range(x: supportedScopes.begin(), y: supportedScopes.end())));
1907 }
1908 return S->second;
1909}
1910
1911bool NVPTXScopes::empty() const { return Scopes.size() == 0; }
1912
// Expands to the NVPTX bulk-tensor opcode for the given direction/dim/mode,
// selecting the SHARED32 variant when shared pointers are 32-bit, with
// `suffix` appended to the opcode name.
#define CP_ASYNC_BULK_TENSOR_OPCODE(dir, dim, mode, is_s32, suffix)            \
  (is_s32                                                                      \
       ? NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_SHARED32_##mode##suffix   \
       : NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_##mode##suffix)

// Same as above for the RED (reduction) direction, choosing the `_CH`
// (cache-hint) opcode variant when is_ch is set.
#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(dim, mode, is_ch, is_s32)      \
  (is_ch ? (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, _CH))          \
         : (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, )))
1921
// Map a tensor dimensionality (1-5) plus the shared-pointer-width, cache-hint,
// and im2col/tile mode flags to the concrete S2G reduction machine opcode.
// im2col mode only exists for 3-5 dimensions; tile mode covers 1-5.
static unsigned GetCpAsyncBulkTensorS2GReductionOpcode(size_t Dim,
                                                       bool IsShared32,
                                                       bool IsCacheHint,
                                                       bool IsIm2Col) {
  if (IsIm2Col) {
    switch (Dim) {
    case 3:
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(3D, IM2COL, IsCacheHint,
                                                     IsShared32);
    case 4:
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(4D, IM2COL, IsCacheHint,
                                                     IsShared32);
    case 5:
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(5D, IM2COL, IsCacheHint,
                                                     IsShared32);
    default:
      llvm_unreachable("Invalid Dimension in im2col mode for "
                       "GetCpAsyncBulkTensorS2GReductionOpcode.");
    }
  } else {
    switch (Dim) {
    case 1:
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(1D, TILE, IsCacheHint,
                                                     IsShared32);
    case 2:
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(2D, TILE, IsCacheHint,
                                                     IsShared32);
    case 3:
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(3D, TILE, IsCacheHint,
                                                     IsShared32);
    case 4:
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(4D, TILE, IsCacheHint,
                                                     IsShared32);
    case 5:
      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(5D, TILE, IsCacheHint,
                                                     IsShared32);
    default:
      llvm_unreachable("Invalid Dimension in tile mode for "
                       "GetCpAsyncBulkTensorS2GReductionOpcode.");
    }
  }
}
1964
// Shared lowering for all cp.async.bulk.tensor.reduce.* intrinsics: slices
// the intrinsic operands, appends the reduction-op immediate and the chain,
// and emits the matching machine opcode.
void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N,
                                                            unsigned RedOp,
                                                            bool IsIm2Col) {
  // We have {Chain, Intrinsic-ID} followed by the actual intrisic args:
  // src, dst, dims{d0...dN}, cache_hint, cache_hint_flag
  // NumOperands = {Chain, IID} + {Actual intrinsic args}
  //             = {2} + {4 + dims}
  size_t NumOps = N->getNumOperands();
  size_t NumDims = NumOps - 6;
  // The last operand is a flag saying whether the cache-hint operand is valid.
  bool IsCacheHint = N->getConstantOperandVal(Num: NumOps - 1) == 1;
  // src + dst (+ cache_hint when the flag is set); the dims sit in between.
  size_t NumArgs = NumDims + (IsCacheHint ? 3 : 2); // src, dst, cache_hint

  SDLoc DL(N);
  // Copy the intrinsic args starting after {Chain, IID}.
  SmallVector<SDValue, 12> Ops(N->ops().slice(N: 2, M: NumArgs));
  Ops.push_back(Elt: getI32Imm(Imm: RedOp, DL)); // Reduction Op
  Ops.push_back(Elt: N->getOperand(Num: 0));    // Chain operand

  // Shared-memory pointers may be 32-bit; that selects a different opcode.
  bool IsShared32 =
      CurDAG->getDataLayout().getPointerSizeInBits(AS: ADDRESS_SPACE_SHARED) == 32;
  unsigned Opcode = GetCpAsyncBulkTensorS2GReductionOpcode(
      Dim: NumDims, IsShared32, IsCacheHint, IsIm2Col);
  ReplaceNode(F: N, T: CurDAG->getMachineNode(Opcode, dl: DL, VTs: N->getVTList(), Ops));
}
1988
// Expands to the tcgen05.st machine opcode for a given SHAPE/NUM pair,
// choosing the _UNPACK variant when `enableUnpack` is set in the caller.
#define TCGEN05_ST_OPCODE(SHAPE, NUM)                                          \
  (enableUnpack ? NVPTX::TCGEN05_ST_##SHAPE##_##NUM##_UNPACK                   \
                : NVPTX::TCGEN05_ST_##SHAPE##_##NUM)
1992
// Map a tcgen05.st intrinsic ID to its machine opcode; `enableUnpack` picks
// the _UNPACK variant via the TCGEN05_ST_OPCODE macro above.
static unsigned getTcgen05StOpcode(unsigned IID, bool enableUnpack) {
  switch (IID) {
  case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
    return TCGEN05_ST_OPCODE(16x64b, x1);
  case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
    return TCGEN05_ST_OPCODE(16x64b, x2);
  case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
    return TCGEN05_ST_OPCODE(16x64b, x4);
  case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
    return TCGEN05_ST_OPCODE(16x64b, x8);
  case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
    return TCGEN05_ST_OPCODE(16x64b, x16);
  case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
    return TCGEN05_ST_OPCODE(16x64b, x32);
  case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
    return TCGEN05_ST_OPCODE(16x64b, x64);
  case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
    return TCGEN05_ST_OPCODE(16x64b, x128);
  case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
    return TCGEN05_ST_OPCODE(16x128b, x1);
  case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
    return TCGEN05_ST_OPCODE(16x128b, x2);
  case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
    return TCGEN05_ST_OPCODE(16x128b, x4);
  case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
    return TCGEN05_ST_OPCODE(16x128b, x8);
  case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
    return TCGEN05_ST_OPCODE(16x128b, x16);
  case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
    return TCGEN05_ST_OPCODE(16x128b, x32);
  case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
    return TCGEN05_ST_OPCODE(16x128b, x64);
  case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
    return TCGEN05_ST_OPCODE(16x256b, x1);
  case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
    return TCGEN05_ST_OPCODE(16x256b, x2);
  case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
    return TCGEN05_ST_OPCODE(16x256b, x4);
  case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
    return TCGEN05_ST_OPCODE(16x256b, x8);
  case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
    return TCGEN05_ST_OPCODE(16x256b, x16);
  case Intrinsic::nvvm_tcgen05_st_16x256b_x32:
    return TCGEN05_ST_OPCODE(16x256b, x32);
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1:
    return TCGEN05_ST_OPCODE(16x32bx2, x1);
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2:
    return TCGEN05_ST_OPCODE(16x32bx2, x2);
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4:
    return TCGEN05_ST_OPCODE(16x32bx2, x4);
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8:
    return TCGEN05_ST_OPCODE(16x32bx2, x8);
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16:
    return TCGEN05_ST_OPCODE(16x32bx2, x16);
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32:
    return TCGEN05_ST_OPCODE(16x32bx2, x32);
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64:
    return TCGEN05_ST_OPCODE(16x32bx2, x64);
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128:
    return TCGEN05_ST_OPCODE(16x32bx2, x128);
  case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
    return TCGEN05_ST_OPCODE(32x32b, x1);
  case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
    return TCGEN05_ST_OPCODE(32x32b, x2);
  case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
    return TCGEN05_ST_OPCODE(32x32b, x4);
  case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
    return TCGEN05_ST_OPCODE(32x32b, x8);
  case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
    return TCGEN05_ST_OPCODE(32x32b, x16);
  case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
    return TCGEN05_ST_OPCODE(32x32b, x32);
  case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
    return TCGEN05_ST_OPCODE(32x32b, x64);
  case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
    return TCGEN05_ST_OPCODE(32x32b, x128);
  }
  llvm_unreachable("unhandled tcgen05.st lowering");
}
2072
2073void NVPTXDAGToDAGISel::SelectTcgen05St(SDNode *N, bool hasOffset) {
2074 if (!Subtarget->hasTcgen05InstSupport())
2075 report_fatal_error(
2076 reason: "tcgen05.st is not supported on this architecture variant");
2077
2078 SDLoc DL(N);
2079 unsigned IID = cast<ConstantSDNode>(Val: N->getOperand(Num: 1))->getZExtValue();
2080
2081 SmallVector<SDValue, 128> Operands = {
2082 N->getOperand(Num: 2) // taddr
2083 };
2084
2085 if (hasOffset)
2086 Operands.push_back(Elt: CurDAG->getTargetConstant(
2087 Val: cast<ConstantSDNode>(Val: N->getOperand(Num: 3))->getZExtValue(), DL,
2088 VT: MVT::i32)); // Offset
2089
2090 for (unsigned I = hasOffset ? 4 : 3; I < (N->getNumOperands() - 1); I++)
2091 Operands.push_back(Elt: N->getOperand(Num: I));
2092
2093 bool enableUnpack =
2094 cast<ConstantSDNode>(Val: N->getOperand(Num: N->getNumOperands() - 1))
2095 ->getZExtValue();
2096
2097 Operands.push_back(Elt: N->getOperand(Num: 0)); // Chain
2098 ReplaceNode(F: N, T: CurDAG->getMachineNode(Opcode: getTcgen05StOpcode(IID, enableUnpack),
2099 dl: DL, VTs: N->getVTList(), Ops: Operands));
2100}
2101
// Custom selection for void intrinsics: TMA bulk-tensor reductions and
// tcgen05.st stores. Returns false for any other intrinsic so the normal
// tablegen-driven selection runs instead.
bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
  unsigned IID = N->getConstantOperandVal(Num: 1);
  using TMARedTy = llvm::nvvm::TMAReductionOp;
  // Convert the reduction-op enum into the raw immediate the opcode expects.
  auto CastTy = [](TMARedTy Op) { return static_cast<unsigned>(Op); };
  switch (IID) {
  default:
    return false;
  // cp.async.bulk.tensor.reduce.* — dispatch on the reduction operation,
  // with tile and im2col layouts handled by the same common helper.
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::ADD));
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::ADD),
                                        /*IsIm2Col=*/true);
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::MIN));
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::MIN),
                                        /*IsIm2Col=*/true);
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::MAX));
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::MAX),
                                        /*IsIm2Col=*/true);
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::INC));
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::INC),
                                        /*IsIm2Col=*/true);
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::DEC));
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::DEC),
                                        /*IsIm2Col=*/true);
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::AND));
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::AND),
                                        /*IsIm2Col=*/true);
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::OR));
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::OR),
                                        /*IsIm2Col=*/true);
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::XOR));
    return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_3d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_4d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_5d:
    SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::XOR),
                                        /*IsIm2Col=*/true);
    return true;

  // tcgen05.st shapes without an offset operand.
  case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
  case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
  case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
  case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
  case Intrinsic::nvvm_tcgen05_st_16x256b_x32: {
    SelectTcgen05St(N);
    return true;
  }

  // 16x32bx2 shapes carry an extra offset operand.
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64:
  case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128: {
    SelectTcgen05St(N, /* hasOffset */ true);
    return true;
  }
  }
}
2260
// Lower the 128-bit atomic exchange / compare-exchange pseudo nodes to the
// ATOM_EXCH_B128 / ATOM_CAS_B128 machine instructions, carrying the memory
// operand and the ordering/scope/address-space immediates.
void NVPTXDAGToDAGISel::selectAtomicSwap128(SDNode *N) {
  MemSDNode *AN = cast<MemSDNode>(Val: N);
  SDLoc dl(N);

  const SDValue Chain = N->getOperand(Num: 0);
  // Split the pointer operand into base + offset addressing.
  const auto [Base, Offset] = selectADDR(Addr: N->getOperand(Num: 1), DAG: CurDAG);
  // Operand order: base, offset, value operand(s), then the ordering/scope/
  // address-space immediates, then the chain.
  SmallVector<SDValue, 5> Ops{Base, Offset};
  Ops.append(in_start: N->op_begin() + 2, in_end: N->op_end());
  Ops.append(IL: {
      getI32Imm(Imm: getMemOrder(N: AN), DL: dl),
      getI32Imm(Imm: getAtomicScope(N: AN), DL: dl),
      getI32Imm(Imm: getAddrSpace(N: AN), DL: dl),
      Chain,
  });

  assert(N->getOpcode() == NVPTXISD::ATOMIC_CMP_SWAP_B128 ||
         N->getOpcode() == NVPTXISD::ATOMIC_SWAP_B128);
  unsigned Opcode = N->getOpcode() == NVPTXISD::ATOMIC_SWAP_B128
                        ? NVPTX::ATOM_EXCH_B128
                        : NVPTX::ATOM_CAS_B128;

  auto *ATOM = CurDAG->getMachineNode(Opcode, dl, VTs: N->getVTList(), Ops);
  // Preserve the memory operand so later passes see the correct mem info.
  CurDAG->setNodeMemRefs(N: ATOM, NewMemRefs: AN->getMemOperand());

  ReplaceNode(F: N, T: ATOM);
}
2287
// Lower ISD::BR_JT to the NVPTX brx.idx sequence: a BRX_START node, one
// BRX_ITEM per jump-table target except the last, and a closing BRX_END.
// The nodes are linked with chain + glue so they stay contiguous.
void NVPTXDAGToDAGISel::selectBR_JT(SDNode *N) {
  assert(Subtarget->hasBrx() &&
         "BR_JT should be expanded during legalization on unsupported targets");

  SDLoc DL(N);
  const SDValue InChain = N->getOperand(Num: 0);
  const auto *JT = cast<JumpTableSDNode>(Val: N->getOperand(Num: 1));
  const SDValue Index = N->getOperand(Num: 2);

  // Fetch the basic-block list for this jump table from the function's
  // jump-table info.
  unsigned JId = JT->getIndex();
  MachineJumpTableInfo *MJTI = CurDAG->getMachineFunction().getJumpTableInfo();
  ArrayRef<MachineBasicBlock *> MBBs = MJTI->getJumpTables()[JId].MBBs;

  SDValue IdV = getI32Imm(Imm: JId, DL);

  // Generate BrxStart node
  MachineSDNode *Chain = CurDAG->getMachineNode(
      Opcode: NVPTX::BRX_START, dl: DL, ResultTys: {MVT::Other, MVT::Glue}, Ops: {IdV, InChain});

  // Generate BrxItem nodes, each consuming the previous node's chain and glue
  // so the sequence cannot be reordered or split.
  assert(!MBBs.empty());
  for (MachineBasicBlock *MBB : MBBs.drop_back())
    Chain = CurDAG->getMachineNode(
        Opcode: NVPTX::BRX_ITEM, dl: DL, ResultTys: {MVT::Other, MVT::Glue},
        Ops: {CurDAG->getBasicBlock(MBB), SDValue(Chain, 0), SDValue(Chain, 1)});

  // Generate BrxEnd nodes; the last target block rides on BRX_END together
  // with the index operand.
  MachineSDNode *BrxEnd =
      CurDAG->getMachineNode(Opcode: NVPTX::BRX_END, dl: DL, VT: MVT::Other,
                             Ops: {CurDAG->getBasicBlock(MBB: MBBs.back()), Index, IdV,
                                  SDValue(Chain, 0), SDValue(Chain, 1)});

  ReplaceNode(F: N, T: BrxEnd);
}
2322