1//===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines an instruction selector for the NVPTX target.
10//
11//===----------------------------------------------------------------------===//
12
13#include "NVPTXISelDAGToDAG.h"
14#include "NVPTX.h"
15#include "NVPTXUtilities.h"
16#include "llvm/ADT/APInt.h"
17#include "llvm/Analysis/ValueTracking.h"
18#include "llvm/CodeGen/ISDOpcodes.h"
19#include "llvm/CodeGen/MachineJumpTableInfo.h"
20#include "llvm/CodeGen/SelectionDAG.h"
21#include "llvm/CodeGen/SelectionDAGNodes.h"
22#include "llvm/IR/GlobalValue.h"
23#include "llvm/IR/Instructions.h"
24#include "llvm/IR/IntrinsicsNVPTX.h"
25#include "llvm/IR/NVVMIntrinsicUtils.h"
26#include "llvm/Support/AtomicOrdering.h"
27#include "llvm/Support/CommandLine.h"
28#include "llvm/Support/ErrorHandling.h"
29#include "llvm/Support/FormatVariadic.h"
30#include "llvm/Support/MathExtras.h"
31#include <optional>
32
33using namespace llvm;
34
35#define DEBUG_TYPE "nvptx-isel"
36#define PASS_NAME "NVPTX DAG->DAG Pattern Instruction Selection"
37
38static cl::opt<bool>
39 EnableRsqrtOpt("nvptx-rsqrt-approx-opt", cl::init(Val: true), cl::Hidden,
40 cl::desc("Enable reciprocal sqrt optimization"));
41
42// FIXME: This is a WAR to recover lost performance from #155024.
43// We still need to investigate the regression and find a more permanent
44// solution.
45static cl::opt<bool> EnableMADWide("nvptx-mad-wide-opt", cl::init(Val: false),
46 cl::Hidden,
47 cl::desc("Enable MAD wide optimization"));
48
49/// createNVPTXISelDag - This pass converts a legalized DAG into a
50/// NVPTX-specific DAG, ready for instruction scheduling.
51FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
52 llvm::CodeGenOptLevel OptLevel) {
53 return new NVPTXDAGToDAGISelLegacy(TM, OptLevel);
54}
55
56NVPTXDAGToDAGISelLegacy::NVPTXDAGToDAGISelLegacy(NVPTXTargetMachine &tm,
57 CodeGenOptLevel OptLevel)
58 : SelectionDAGISelLegacy(
59 ID, std::make_unique<NVPTXDAGToDAGISel>(args&: tm, args&: OptLevel)) {}
60
61char NVPTXDAGToDAGISelLegacy::ID = 0;
62
63INITIALIZE_PASS(NVPTXDAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false)
64
65NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
66 CodeGenOptLevel OptLevel)
67 : SelectionDAGISel(tm, OptLevel), TM(tm) {}
68
69bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
70 Subtarget = &MF.getSubtarget<NVPTXSubtarget>();
71 Scopes = NVPTXScopes(MF.getFunction().getContext());
72 return SelectionDAGISel::runOnMachineFunction(mf&: MF);
73}
74
75NVPTX::DivPrecisionLevel
76NVPTXDAGToDAGISel::getDivF32Level(const SDNode *N) const {
77 return Subtarget->getTargetLowering()->getDivF32Level(MF: *MF, N: *N);
78}
79
80bool NVPTXDAGToDAGISel::usePrecSqrtF32(const SDNode *N) const {
81 return Subtarget->getTargetLowering()->usePrecSqrtF32(N);
82}
83
84bool NVPTXDAGToDAGISel::useF32FTZ() const {
85 return Subtarget->getTargetLowering()->useF32FTZ(MF: *MF);
86}
87
88bool NVPTXDAGToDAGISel::allowFMA() const {
89 const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
90 return TL->allowFMA(MF&: *MF, OptLevel);
91}
92
93bool NVPTXDAGToDAGISel::doRsqrtOpt() const { return EnableRsqrtOpt; }
94
95bool NVPTXDAGToDAGISel::doMADWideOpt() const { return EnableMADWide; }
96
97/// Select - Select instructions not customized! Used for
98/// expanded, promoted and normal instructions.
99void NVPTXDAGToDAGISel::Select(SDNode *N) {
100
101 if (N->isMachineOpcode()) {
102 N->setNodeId(-1);
103 return; // Already selected.
104 }
105
106 switch (N->getOpcode()) {
107 case ISD::LOAD:
108 case ISD::ATOMIC_LOAD:
109 case NVPTXISD::MLoad:
110 if (tryLoad(N))
111 return;
112 break;
113 case ISD::STORE:
114 case ISD::ATOMIC_STORE:
115 if (tryStore(N))
116 return;
117 break;
118 case ISD::ATOMIC_FENCE:
119 if (tryFence(N))
120 return;
121 break;
122 case NVPTXISD::UNPACK_VECTOR:
123 tryUNPACK_VECTOR(N);
124 return;
125 case ISD::EXTRACT_VECTOR_ELT:
126 if (tryEXTRACT_VECTOR_ELEMENT(N))
127 return;
128 break;
129 case NVPTXISD::SETP_F16X2:
130 SelectSETP_F16X2(N);
131 return;
132 case NVPTXISD::SETP_BF16X2:
133 SelectSETP_BF16X2(N);
134 return;
135 case NVPTXISD::LoadV2:
136 case NVPTXISD::LoadV4:
137 case NVPTXISD::LoadV8:
138 if (tryLoadVector(N))
139 return;
140 break;
141 case NVPTXISD::LDUV2:
142 case NVPTXISD::LDUV4:
143 if (tryLDU(N))
144 return;
145 break;
146 case NVPTXISD::StoreV2:
147 case NVPTXISD::StoreV4:
148 case NVPTXISD::StoreV8:
149 if (tryStoreVector(N))
150 return;
151 break;
152 case ISD::INTRINSIC_W_CHAIN:
153 if (tryIntrinsicChain(N))
154 return;
155 break;
156 case ISD::INTRINSIC_VOID:
157 if (tryIntrinsicVoid(N))
158 return;
159 break;
160 case ISD::AND:
161 case ISD::SRA:
162 case ISD::SRL:
163 // Try to select BFE
164 if (tryBFE(N))
165 return;
166 break;
167 case ISD::ADDRSPACECAST:
168 SelectAddrSpaceCast(N);
169 return;
170 case ISD::CopyToReg: {
171 if (N->getOperand(Num: 1).getValueType() == MVT::i128) {
172 SelectV2I64toI128(N);
173 return;
174 }
175 break;
176 }
177 case ISD::CopyFromReg: {
178 if (N->getOperand(Num: 1).getValueType() == MVT::i128) {
179 SelectI128toV2I64(N);
180 return;
181 }
182 break;
183 }
184 case NVPTXISD::ATOMIC_CMP_SWAP_B128:
185 case NVPTXISD::ATOMIC_SWAP_B128:
186 selectAtomicSwap128(N);
187 return;
188 case ISD::FADD:
189 case ISD::FMUL:
190 case ISD::FSUB:
191 if (tryBF16ArithToFMA(N))
192 return;
193 break;
194 case ISD::BR_JT:
195 return selectBR_JT(N);
196 default:
197 break;
198 }
199 SelectCode(N);
200}
201
202#define TCGEN05_LD_OPCODE(SHAPE, NUM) \
203 (enablePack ? NVPTX::TCGEN05_LD_##SHAPE##_##NUM##_PACK \
204 : NVPTX::TCGEN05_LD_##SHAPE##_##NUM)
205
206static unsigned getTcgen05LdOpcode(unsigned IID, bool enablePack) {
207 switch (IID) {
208 case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
209 return TCGEN05_LD_OPCODE(16x64b, x1);
210 case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
211 return TCGEN05_LD_OPCODE(16x64b, x2);
212 case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
213 return TCGEN05_LD_OPCODE(16x64b, x4);
214 case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
215 return TCGEN05_LD_OPCODE(16x64b, x8);
216 case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
217 return TCGEN05_LD_OPCODE(16x64b, x16);
218 case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
219 return TCGEN05_LD_OPCODE(16x64b, x32);
220 case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
221 return TCGEN05_LD_OPCODE(16x64b, x64);
222 case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
223 return TCGEN05_LD_OPCODE(16x64b, x128);
224 case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
225 return TCGEN05_LD_OPCODE(16x128b, x1);
226 case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
227 return TCGEN05_LD_OPCODE(16x128b, x2);
228 case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
229 return TCGEN05_LD_OPCODE(16x128b, x4);
230 case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
231 return TCGEN05_LD_OPCODE(16x128b, x8);
232 case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
233 return TCGEN05_LD_OPCODE(16x128b, x16);
234 case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
235 return TCGEN05_LD_OPCODE(16x128b, x32);
236 case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
237 return TCGEN05_LD_OPCODE(16x128b, x64);
238 case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
239 return TCGEN05_LD_OPCODE(16x256b, x1);
240 case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
241 return TCGEN05_LD_OPCODE(16x256b, x2);
242 case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
243 return TCGEN05_LD_OPCODE(16x256b, x4);
244 case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
245 return TCGEN05_LD_OPCODE(16x256b, x8);
246 case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
247 return TCGEN05_LD_OPCODE(16x256b, x16);
248 case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
249 return TCGEN05_LD_OPCODE(16x256b, x32);
250 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1:
251 return TCGEN05_LD_OPCODE(16x32bx2, x1);
252 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
253 return TCGEN05_LD_OPCODE(16x32bx2, x2);
254 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
255 return TCGEN05_LD_OPCODE(16x32bx2, x4);
256 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
257 return TCGEN05_LD_OPCODE(16x32bx2, x8);
258 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
259 return TCGEN05_LD_OPCODE(16x32bx2, x16);
260 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
261 return TCGEN05_LD_OPCODE(16x32bx2, x32);
262 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
263 return TCGEN05_LD_OPCODE(16x32bx2, x64);
264 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128:
265 return TCGEN05_LD_OPCODE(16x32bx2, x128);
266 case Intrinsic::nvvm_tcgen05_ld_32x32b_x1:
267 return TCGEN05_LD_OPCODE(32x32b, x1);
268 case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
269 return TCGEN05_LD_OPCODE(32x32b, x2);
270 case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
271 return TCGEN05_LD_OPCODE(32x32b, x4);
272 case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
273 return TCGEN05_LD_OPCODE(32x32b, x8);
274 case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
275 return TCGEN05_LD_OPCODE(32x32b, x16);
276 case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
277 return TCGEN05_LD_OPCODE(32x32b, x32);
278 case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
279 return TCGEN05_LD_OPCODE(32x32b, x64);
280 case Intrinsic::nvvm_tcgen05_ld_32x32b_x128:
281 return TCGEN05_LD_OPCODE(32x32b, x128);
282 }
283 llvm_unreachable("unhandled tcgen05.ld lowering");
284}
285
286void NVPTXDAGToDAGISel::SelectTcgen05Ld(SDNode *N, bool hasOffset) {
287 if (!Subtarget->hasTcgen05InstSupport())
288 report_fatal_error(
289 reason: "tcgen05.ld is not supported on this architecture variant");
290
291 SDLoc DL(N);
292 unsigned IID = cast<ConstantSDNode>(Val: N->getOperand(Num: 1))->getZExtValue();
293
294 if (hasOffset) {
295 bool enablePack = cast<ConstantSDNode>(Val: N->getOperand(Num: 4))->getZExtValue();
296 auto OffsetNode = CurDAG->getTargetConstant(
297 Val: cast<ConstantSDNode>(Val: N->getOperand(Num: 3))->getZExtValue(), DL, VT: MVT::i32);
298 ReplaceNode(F: N, T: CurDAG->getMachineNode(
299 Opcode: getTcgen05LdOpcode(IID, enablePack), dl: DL, VTs: N->getVTList(),
300 Ops: {N->getOperand(Num: 2), OffsetNode, N->getOperand(Num: 0)}));
301 } else {
302 bool enablePack = cast<ConstantSDNode>(Val: N->getOperand(Num: 3))->getZExtValue();
303 ReplaceNode(F: N, T: CurDAG->getMachineNode(
304 Opcode: getTcgen05LdOpcode(IID, enablePack), dl: DL, VTs: N->getVTList(),
305 Ops: {N->getOperand(Num: 2), N->getOperand(Num: 0)}));
306 }
307}
308
309bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
310 unsigned IID = N->getConstantOperandVal(Num: 1);
311 switch (IID) {
312 default:
313 return false;
314 case Intrinsic::nvvm_ldu_global_f:
315 case Intrinsic::nvvm_ldu_global_i:
316 case Intrinsic::nvvm_ldu_global_p:
317 return tryLDU(N);
318
319 case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
320 case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
321 case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
322 case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
323 case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
324 case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
325 case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
326 case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
327 case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
328 case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
329 case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
330 case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
331 case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
332 case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
333 case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
334 case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
335 case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
336 case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
337 case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
338 case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
339 case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
340 case Intrinsic::nvvm_tcgen05_ld_32x32b_x1:
341 case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
342 case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
343 case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
344 case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
345 case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
346 case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
347 case Intrinsic::nvvm_tcgen05_ld_32x32b_x128: {
348 SelectTcgen05Ld(N);
349 return true;
350 }
351
352 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1:
353 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
354 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
355 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
356 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
357 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
358 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
359 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128: {
360 SelectTcgen05Ld(N, /* hasOffset */ true);
361 return true;
362 }
363 }
364}
365
366// Map ISD:CONDCODE value to appropriate CmpMode expected by
367// NVPTXInstPrinter::printCmpMode()
368SDValue NVPTXDAGToDAGISel::getPTXCmpMode(const CondCodeSDNode &CondCode) {
369 using NVPTX::PTXCmpMode::CmpMode;
370 const unsigned PTXCmpMode = [](ISD::CondCode CC) {
371 switch (CC) {
372 default:
373 llvm_unreachable("Unexpected condition code.");
374 case ISD::SETOEQ:
375 case ISD::SETEQ:
376 return CmpMode::EQ;
377 case ISD::SETOGT:
378 case ISD::SETGT:
379 return CmpMode::GT;
380 case ISD::SETOGE:
381 case ISD::SETGE:
382 return CmpMode::GE;
383 case ISD::SETOLT:
384 case ISD::SETLT:
385 return CmpMode::LT;
386 case ISD::SETOLE:
387 case ISD::SETLE:
388 return CmpMode::LE;
389 case ISD::SETONE:
390 case ISD::SETNE:
391 return CmpMode::NE;
392 case ISD::SETO:
393 return CmpMode::NUM;
394 case ISD::SETUO:
395 return CmpMode::NotANumber;
396 case ISD::SETUEQ:
397 return CmpMode::EQU;
398 case ISD::SETUGT:
399 return CmpMode::GTU;
400 case ISD::SETUGE:
401 return CmpMode::GEU;
402 case ISD::SETULT:
403 return CmpMode::LTU;
404 case ISD::SETULE:
405 return CmpMode::LEU;
406 case ISD::SETUNE:
407 return CmpMode::NEU;
408 }
409 }(CondCode.get());
410 return CurDAG->getTargetConstant(Val: PTXCmpMode, DL: SDLoc(), VT: MVT::i32);
411}
412
413bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
414 SDValue PTXCmpMode = getPTXCmpMode(CondCode: *cast<CondCodeSDNode>(Val: N->getOperand(Num: 2)));
415 SDLoc DL(N);
416 SDNode *SetP = CurDAG->getMachineNode(
417 Opcode: NVPTX::SETP_f16x2rr, dl: DL, VT1: MVT::i1, VT2: MVT::i1,
418 Ops: {N->getOperand(Num: 0), N->getOperand(Num: 1), PTXCmpMode,
419 CurDAG->getTargetConstant(Val: useF32FTZ() ? 1 : 0, DL, VT: MVT::i1)});
420 ReplaceNode(F: N, T: SetP);
421 return true;
422}
423
424bool NVPTXDAGToDAGISel::SelectSETP_BF16X2(SDNode *N) {
425 SDValue PTXCmpMode = getPTXCmpMode(CondCode: *cast<CondCodeSDNode>(Val: N->getOperand(Num: 2)));
426 SDLoc DL(N);
427 SDNode *SetP = CurDAG->getMachineNode(
428 Opcode: NVPTX::SETP_bf16x2rr, dl: DL, VT1: MVT::i1, VT2: MVT::i1,
429 Ops: {N->getOperand(Num: 0), N->getOperand(Num: 1), PTXCmpMode,
430 CurDAG->getTargetConstant(Val: useF32FTZ() ? 1 : 0, DL, VT: MVT::i1)});
431 ReplaceNode(F: N, T: SetP);
432 return true;
433}
434
435bool NVPTXDAGToDAGISel::tryUNPACK_VECTOR(SDNode *N) {
436 SDValue Vector = N->getOperand(Num: 0);
437 MVT EltVT = N->getSimpleValueType(ResNo: 0);
438
439 MachineSDNode *N2 =
440 CurDAG->getMachineNode(Opcode: NVPTX::I64toV2I32, dl: SDLoc(N), VT1: EltVT, VT2: EltVT, Ops: Vector);
441
442 ReplaceNode(F: N, T: N2);
443 return true;
444}
445
446// Find all instances of extract_vector_elt that use this v2f16 vector
447// and coalesce them into a scattering move instruction.
448bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
449 SDValue Vector = N->getOperand(Num: 0);
450
451 MVT VT = Vector.getSimpleValueType();
452 if (!(NVPTX::isPackedVectorTy(VT) && VT.getVectorNumElements() == 2))
453 return false;
454
455 unsigned Opcode;
456 if (VT.is32BitVector())
457 Opcode = NVPTX::I32toV2I16;
458 else if (VT.is64BitVector())
459 Opcode = NVPTX::I64toV2I32;
460 else
461 llvm_unreachable("Unhandled packed type");
462
463 // Find and record all uses of this vector that extract element 0 or 1.
464 SmallVector<SDNode *, 4> E0, E1;
465 for (auto *U : Vector.getNode()->users()) {
466 if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
467 continue;
468 if (U->getOperand(Num: 0) != Vector)
469 continue;
470 if (const ConstantSDNode *IdxConst =
471 dyn_cast<ConstantSDNode>(Val: U->getOperand(Num: 1))) {
472 if (IdxConst->getZExtValue() == 0)
473 E0.push_back(Elt: U);
474 else if (IdxConst->getZExtValue() == 1)
475 E1.push_back(Elt: U);
476 else
477 llvm_unreachable("Invalid vector index.");
478 }
479 }
480
481 // There's no point scattering f16x2 if we only ever access one
482 // element of it.
483 if (E0.empty() || E1.empty())
484 return false;
485
486 // Merge (EltTy extractelt(V, 0), EltTy extractelt(V,1))
487 // into EltTy,EltTy Split[EltTy]x2(V)
488 MVT EltVT = VT.getVectorElementType();
489 SDNode *ScatterOp =
490 CurDAG->getMachineNode(Opcode, dl: SDLoc(N), VT1: EltVT, VT2: EltVT, Ops: Vector);
491 for (auto *Node : E0)
492 ReplaceUses(F: SDValue(Node, 0), T: SDValue(ScatterOp, 0));
493 for (auto *Node : E1)
494 ReplaceUses(F: SDValue(Node, 0), T: SDValue(ScatterOp, 1));
495
496 return true;
497}
498
499static std::optional<NVPTX::AddressSpace> convertAS(unsigned AS) {
500 switch (AS) {
501 case llvm::ADDRESS_SPACE_LOCAL:
502 return NVPTX::AddressSpace::Local;
503 case llvm::ADDRESS_SPACE_GLOBAL:
504 return NVPTX::AddressSpace::Global;
505 case llvm::ADDRESS_SPACE_SHARED:
506 return NVPTX::AddressSpace::Shared;
507 case llvm::ADDRESS_SPACE_SHARED_CLUSTER:
508 return NVPTX::AddressSpace::SharedCluster;
509 case llvm::ADDRESS_SPACE_GENERIC:
510 return NVPTX::AddressSpace::Generic;
511 case llvm::ADDRESS_SPACE_PARAM:
512 return NVPTX::AddressSpace::Param;
513 case llvm::ADDRESS_SPACE_CONST:
514 return NVPTX::AddressSpace::Const;
515 default:
516 return std::nullopt;
517 }
518}
519
520NVPTX::AddressSpace NVPTXDAGToDAGISel::getAddrSpace(const MemSDNode *N) {
521 return convertAS(AS: N->getMemOperand()->getAddrSpace())
522 .value_or(u: NVPTX::AddressSpace::Generic);
523}
524
525NVPTX::Ordering NVPTXDAGToDAGISel::getMemOrder(const MemSDNode *N) const {
526 // No "sem" orderings for SM/PTX versions which do not support memory ordering
527 if (!Subtarget->hasMemoryOrdering())
528 return NVPTX::Ordering::NotAtomic;
529 auto Ordering = N->getMergedOrdering();
530 switch (Ordering) {
531 case AtomicOrdering::NotAtomic:
532 return NVPTX::Ordering::NotAtomic;
533 case AtomicOrdering::Unordered:
534 case AtomicOrdering::Monotonic:
535 return NVPTX::Ordering::Relaxed;
536 case AtomicOrdering::Acquire:
537 return NVPTX::Ordering::Acquire;
538 case AtomicOrdering::Release:
539 return NVPTX::Ordering::Release;
540 case AtomicOrdering::AcquireRelease:
541 return NVPTX::Ordering::AcquireRelease;
542 case AtomicOrdering::SequentiallyConsistent:
543 return NVPTX::Ordering::SequentiallyConsistent;
544 }
545 llvm_unreachable("Invalid atomic ordering");
546}
547
548NVPTX::Scope NVPTXDAGToDAGISel::getAtomicScope(const MemSDNode *N) const {
549 // No "scope" modifier for SM/PTX versions which do not support scoped atomics
550 // Functionally, these atomics are at device scope
551 if (!Subtarget->hasAtomScope())
552 return NVPTX::Scope::DefaultDevice;
553 return Scopes[N->getSyncScopeID()];
554}
555
556namespace {
557
558struct OperationOrderings {
559 NVPTX::Ordering InstructionOrdering, FenceOrdering;
560 OperationOrderings(NVPTX::Ordering IO = NVPTX::Ordering::NotAtomic,
561 NVPTX::Ordering FO = NVPTX::Ordering::NotAtomic)
562 : InstructionOrdering(IO), FenceOrdering(FO) {}
563};
564
565static OperationOrderings
566getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) {
567 AtomicOrdering Ordering = N->getSuccessOrdering();
568 auto CodeAddrSpace = NVPTXDAGToDAGISel::getAddrSpace(N);
569
570 bool HasMemoryOrdering = Subtarget->hasMemoryOrdering();
571 bool HasRelaxedMMIO = Subtarget->hasRelaxedMMIO();
572
573 // clang-format off
574
575 // Lowering for Load/Store Operations (note: AcquireRelease Loads or Stores error).
576 // Note: uses of Relaxed in the Atomic column of this table refer
577 // to LLVM AtomicOrdering::Monotonic.
578 //
579 // | Atomic | Volatile | Statespace | PTX sm_60- | PTX sm_70+ |
580 // |---------|----------|--------------------|------------|------------------------------|
581 // | No | No | All | plain | .weak |
582 // | No | Yes | Generic,Shared, | .volatile | .volatile |
583 // | | | Global [0] | | |
584 // | No | Yes | Local,Const,Param | plain [1] | .weak [1] |
585 // | Unorder | Yes/No | All | == Relaxed | == Relaxed |
586 // | Relaxed | No | Generic,Shared, | .volatile | <atomic sem> |
587 // | | | Global [0] | | |
588 // | Other | No | Generic,Shared, | Error [2] | <atomic sem> |
589 // | | | Global [0] | | |
590 // | Yes | No | Local,Const,Param | plain [1] | .weak [1] |
591 // | Relaxed | Yes | Generic,Shared [0] | .volatile | .volatile |
592 // | Relaxed | Yes | Global [0] | .volatile | .mmio.relaxed.sys (PTX 8.2+) |
593 // | | | | | or .volatile (PTX 8.1-) |
594 // | Relaxed | Yes | Local,Const,Param | plain [1] | .weak [1] |
595 // | Other | Yes | Generic, Shared, | Error [2] | <atomic sem> [3] |
596 // | | | / Global [0] | | |
597
598 // Lowering of CUDA C++ SequentiallyConsistent Operations and Fences to PTX
599 // by following the ABI proven sound in:
600 // Lustig et al, A Formal Analysis of the NVIDIA PTX Memory Consistency Model, ASPLOS’19.
601 // https://dl.acm.org/doi/pdf/10.1145/3297858.3304043
602 //
603 // | CUDA C++ Atomic Operation or Atomic Fence | PTX Atomic Operation or Fence |
604 // |------------------------------------------------------|-------------------------------|
605 // | cuda::atomic_thread_fence | fence.sc.<scope>; |
606 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | |
607 // |------------------------------------------------------|-------------------------------|
608 // | cuda::atomic_load | fence.sc.<scope>; |
609 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | ld.acquire.<scope>; |
610 // |------------------------------------------------------|-------------------------------|
611 // | cuda::atomic_store | fence.sc.<scope>; |
612 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | st.release.<scope>; |
613 // |------------------------------------------------------|-------------------------------|
614 // | cuda::atomic_fetch_<op> | fence.sc.<scope>; |
615 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | atom.acq_rel.<scope>; |
616
617 // clang-format on
618
619 // [0]: volatile and atomics are only supported on global or shared
620 // memory locations, accessed via generic/shared/global pointers.
621 // MMIO is only supported on global memory locations,
622 // accessed via generic/global pointers.
623 // TODO: Implement MMIO access via generic pointer to global.
624 // Currently implemented for global pointers only.
625
626 // [1]: Lowering volatile/atomic operations to non-volatile/non-atomic
627 // PTX instructions fails to preserve their C++ side-effects.
628 //
629 // Example (https://github.com/llvm/llvm-project/issues/62057):
630 //
631 // void example() {
632 // std::atomic<bool> True = true;
633 // while (True.load(std::memory_order_relaxed));
634 // }
635 //
636 // A C++ program that calls "example" is well-defined: the infinite loop
637 // performs an atomic operation. By lowering volatile/atomics to
638 // "weak" memory operations, we are transforming the above into:
639 //
640 // void undefined_behavior() {
641 // bool True = true;
642 // while (True);
643 // }
644 //
645 // which exhibits undefined behavior in both C++ and PTX.
646 //
647 // Calling "example" in CUDA C++ compiled for sm_60- exhibits undefined
648 // behavior due to lack of Independent Forward Progress. Lowering these
649 // to weak memory operations in sm_60- is therefore fine.
650 //
651 // TODO: lower atomic and volatile operations to memory locations
652 // in local, const, and param to two PTX instructions in sm_70+:
653 // - the "weak" memory instruction we are currently lowering to, and
654 // - some other instruction that preserves the side-effect, e.g.,
655 // a dead dummy volatile load.
656 if (CodeAddrSpace == NVPTX::AddressSpace::Local ||
657 CodeAddrSpace == NVPTX::AddressSpace::Const ||
658 CodeAddrSpace == NVPTX::AddressSpace::Param) {
659 return NVPTX::Ordering::NotAtomic;
660 }
661
662 // [2]: Atomics with Ordering different than Unordered or Relaxed are not
663 // supported on sm_60 and older; this includes volatile atomics.
664 if (!(Ordering == AtomicOrdering::NotAtomic ||
665 Ordering == AtomicOrdering::Unordered ||
666 Ordering == AtomicOrdering::Monotonic) &&
667 !HasMemoryOrdering) {
668 report_fatal_error(
669 reason: formatv(Fmt: "PTX does not support \"atomic\" for orderings different than"
670 "\"NotAtomic\" or \"Monotonic\" for sm_60 or older, but order "
671 "is: \"{}\".",
672 Vals: toIRString(ao: Ordering)));
673 }
674
675 // [3]: TODO: these should eventually use .mmio<.atomic sem>; for now we drop
676 // the volatile semantics and preserve the atomic ones.
677
678 // PTX volatile and PTX atomics are not available for statespace that differ
679 // from .generic, .global, or .shared. The behavior of PTX volatile and PTX
680 // atomics is undefined if the generic address does not refer to a .global or
681 // .shared memory location.
682 bool AddrGenericOrGlobalOrShared =
683 (CodeAddrSpace == NVPTX::AddressSpace::Generic ||
684 CodeAddrSpace == NVPTX::AddressSpace::Global ||
685 CodeAddrSpace == NVPTX::AddressSpace::Shared ||
686 CodeAddrSpace == NVPTX::AddressSpace::SharedCluster);
687 if (!AddrGenericOrGlobalOrShared)
688 return NVPTX::Ordering::NotAtomic;
689
690 bool UseRelaxedMMIO =
691 HasRelaxedMMIO && CodeAddrSpace == NVPTX::AddressSpace::Global;
692
693 switch (Ordering) {
694 case AtomicOrdering::NotAtomic:
695 return N->isVolatile() ? NVPTX::Ordering::Volatile
696 : NVPTX::Ordering::NotAtomic;
697 case AtomicOrdering::Unordered:
698 // We lower unordered in the exact same way as 'monotonic' to respect
699 // LLVM IR atomicity requirements.
700 case AtomicOrdering::Monotonic:
701 if (N->isVolatile())
702 return UseRelaxedMMIO ? NVPTX::Ordering::RelaxedMMIO
703 : NVPTX::Ordering::Volatile;
704 else
705 return HasMemoryOrdering ? NVPTX::Ordering::Relaxed
706 : NVPTX::Ordering::Volatile;
707 // case AtomicOrdering::Consume: // If LLVM ever provides this, lower it to
708 // Acquire.
709 case AtomicOrdering::Acquire:
710 if (!N->readMem())
711 report_fatal_error(
712 reason: formatv(Fmt: "PTX only supports Acquire Ordering on reads: {}",
713 Vals: N->getOperationName()));
714 return NVPTX::Ordering::Acquire;
715 case AtomicOrdering::Release:
716 if (!N->writeMem())
717 report_fatal_error(
718 reason: formatv(Fmt: "PTX only supports Release Ordering on writes: {}",
719 Vals: N->getOperationName()));
720 return NVPTX::Ordering::Release;
721 case AtomicOrdering::AcquireRelease: {
722 report_fatal_error(
723 reason: formatv(Fmt: "NVPTX does not support AcquireRelease Ordering on "
724 "read-modify-write "
725 "yet and PTX does not support it on loads or stores: {}",
726 Vals: N->getOperationName()));
727 }
728 case AtomicOrdering::SequentiallyConsistent: {
729 // LLVM-IR SequentiallyConsistent atomics map to a two-instruction PTX
730 // sequence including a "fence.sc.sco" and the memory instruction with an
731 // Ordering that differs from "sc": acq, rel, or acq_rel, depending on
732 // whether the memory operation is a read, write, or read-modify-write.
733 //
734 // This sets the ordering of the fence to SequentiallyConsistent, and
735 // sets the corresponding ordering for the instruction.
736 NVPTX::Ordering InstrOrder;
737 if (N->readMem())
738 InstrOrder = NVPTX::Ordering::Acquire;
739 else if (N->writeMem())
740 InstrOrder = NVPTX::Ordering::Release;
741 else
742 report_fatal_error(
743 reason: formatv(Fmt: "NVPTX does not support SequentiallyConsistent Ordering on "
744 "read-modify-writes yet: {}",
745 Vals: N->getOperationName()));
746 return OperationOrderings(InstrOrder,
747 NVPTX::Ordering::SequentiallyConsistent);
748 }
749 }
750 report_fatal_error(
751 reason: formatv(Fmt: "NVPTX backend does not support AtomicOrdering \"{}\" yet.",
752 Vals: toIRString(ao: Ordering)));
753}
754
755} // namespace
756
757NVPTX::Scope NVPTXDAGToDAGISel::getOperationScope(MemSDNode *N,
758 NVPTX::Ordering O) const {
759 switch (O) {
760 case NVPTX::Ordering::NotAtomic:
761 case NVPTX::Ordering::Volatile: // Non-atomic volatile operations
762 // NVPTX uses Thread scope as the scope of non-atomic operations.
763 return NVPTX::Scope::Thread;
764 case NVPTX::Ordering::RelaxedMMIO:
765 // RelaxedMMIO operations are always system scope.
766 // If a RelaxedMMIO order was generated from an atomic volatile operation
767 // with a smaller thread scope, we bump it here to system scope.
768 return NVPTX::Scope::System;
769 case NVPTX::Ordering::Relaxed:
770 case NVPTX::Ordering::Acquire:
771 case NVPTX::Ordering::Release:
772 case NVPTX::Ordering::AcquireRelease:
773 case NVPTX::Ordering::SequentiallyConsistent:
774 auto S = Scopes[N->getSyncScopeID()];
775
776 // Atomic operations must have a scope greater than thread.
777 if (S == NVPTX::Scope::Thread)
778 report_fatal_error(
779 reason: formatv(Fmt: "Atomics need scope > \"{}\".", Vals: ScopeToString(S)));
780
781 // If scope is cluster, clusters must be supported.
782 if (S == NVPTX::Scope::Cluster)
783 Subtarget->failIfClustersUnsupported(FailureMessage: "cluster scope");
784
785 // If operation is volatile, then its scope is system.
786 return N->isVolatile() ? NVPTX::Scope::System : S;
787 }
788 llvm_unreachable("unhandled ordering");
789}
790
791static bool canLowerToLDG(const MemSDNode &N, const NVPTXSubtarget &Subtarget,
792 NVPTX::AddressSpace CodeAddrSpace) {
793 // We use ldg (i.e. ld.global.nc) for invariant loads from the global address
794 // space.
795 return Subtarget.hasLDG() && CodeAddrSpace == NVPTX::AddressSpace::Global &&
796 N.isInvariant();
797}
798
799static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S,
800 NVPTXSubtarget const *T) {
801 if (S == NVPTX::Scope::Cluster)
802 T->failIfClustersUnsupported(FailureMessage: ".cluster scope fence");
803
804 // Fall back to .acq_rel if .acquire, .release is not supported.
805 if (!T->hasSplitAcquireAndReleaseFences() &&
806 (O == NVPTX::Ordering::Acquire || O == NVPTX::Ordering::Release))
807 O = NVPTX::Ordering::AcquireRelease;
808
809 switch (O) {
810 case NVPTX::Ordering::Acquire:
811 switch (S) {
812 case NVPTX::Scope::System:
813 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_sys
814 : NVPTX::INT_MEMBAR_SYS;
815 case NVPTX::Scope::Block:
816 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_cta
817 : NVPTX::INT_MEMBAR_CTA;
818 case NVPTX::Scope::Cluster:
819 return NVPTX::atomic_thread_fence_acquire_cluster;
820 case NVPTX::Scope::Device:
821 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_gpu
822 : NVPTX::INT_MEMBAR_GL;
823 case NVPTX::Scope::Thread:
824 case NVPTX::Scope::DefaultDevice:
825 report_fatal_error(
826 reason: formatv(Fmt: "Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
827 Vals: ScopeToString(S)));
828 }
829 break;
830 case NVPTX::Ordering::Release:
831 switch (S) {
832 case NVPTX::Scope::System:
833 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_sys
834 : NVPTX::INT_MEMBAR_SYS;
835 case NVPTX::Scope::Block:
836 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_cta
837 : NVPTX::INT_MEMBAR_CTA;
838 case NVPTX::Scope::Cluster:
839 return NVPTX::atomic_thread_fence_release_cluster;
840 case NVPTX::Scope::Device:
841 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_gpu
842 : NVPTX::INT_MEMBAR_GL;
843 case NVPTX::Scope::Thread:
844 case NVPTX::Scope::DefaultDevice:
845 report_fatal_error(
846 reason: formatv(Fmt: "Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
847 Vals: ScopeToString(S)));
848 }
849 break;
850 case NVPTX::Ordering::AcquireRelease: {
851 switch (S) {
852 case NVPTX::Scope::System:
853 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_sys
854 : NVPTX::INT_MEMBAR_SYS;
855 case NVPTX::Scope::Block:
856 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_cta
857 : NVPTX::INT_MEMBAR_CTA;
858 case NVPTX::Scope::Cluster:
859 return NVPTX::atomic_thread_fence_acq_rel_cluster;
860 case NVPTX::Scope::Device:
861 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_gpu
862 : NVPTX::INT_MEMBAR_GL;
863 case NVPTX::Scope::Thread:
864 case NVPTX::Scope::DefaultDevice:
865 report_fatal_error(
866 reason: formatv(Fmt: "Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
867 Vals: ScopeToString(S)));
868 }
869 break;
870 }
871 case NVPTX::Ordering::SequentiallyConsistent: {
872 switch (S) {
873 case NVPTX::Scope::System:
874 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_sys
875 : NVPTX::INT_MEMBAR_SYS;
876 case NVPTX::Scope::Block:
877 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_cta
878 : NVPTX::INT_MEMBAR_CTA;
879 case NVPTX::Scope::Cluster:
880 return NVPTX::atomic_thread_fence_seq_cst_cluster;
881 case NVPTX::Scope::Device:
882 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_gpu
883 : NVPTX::INT_MEMBAR_GL;
884 case NVPTX::Scope::Thread:
885 case NVPTX::Scope::DefaultDevice:
886 report_fatal_error(reason: formatv(Fmt: "Unsupported scope \"{}\" for seq_cst fence.",
887 Vals: ScopeToString(S)));
888 }
889 break;
890 }
891 case NVPTX::Ordering::NotAtomic:
892 case NVPTX::Ordering::Relaxed:
893 case NVPTX::Ordering::Volatile:
894 case NVPTX::Ordering::RelaxedMMIO:
895 report_fatal_error(
896 reason: formatv(Fmt: "Unsupported \"{}\" ordering and \"{}\" scope for fence.",
897 Vals: OrderingToString(Order: O), Vals: ScopeToString(S)));
898 }
899 llvm_unreachable("unhandled ordering");
900}
901
902// Returns Memory Order and Scope of a memory instruction, and
903// inserts any fence before the instruction that's required to
904// implement its memory ordering.
905std::pair<NVPTX::Ordering, NVPTX::Scope>
906NVPTXDAGToDAGISel::insertMemoryInstructionFence(SDLoc DL, SDValue &Chain,
907 MemSDNode *N) {
908 auto [InstructionOrdering, FenceOrdering] =
909 getOperationOrderings(N, Subtarget);
910 auto Scope = getOperationScope(N, O: InstructionOrdering);
911
912 // If a fence is required before the operation, insert it:
913 switch (NVPTX::Ordering(FenceOrdering)) {
914 case NVPTX::Ordering::NotAtomic:
915 break;
916 case NVPTX::Ordering::SequentiallyConsistent: {
917 auto Op = getFenceOp(O: FenceOrdering, S: Scope, T: Subtarget);
918 Chain = SDValue(CurDAG->getMachineNode(Opcode: Op, dl: DL, VT: MVT::Other, Op1: Chain), 0);
919 break;
920 }
921 default:
922 report_fatal_error(
923 reason: formatv(Fmt: "Unexpected fence ordering: \"{}\".",
924 Vals: OrderingToString(Order: NVPTX::Ordering(FenceOrdering))));
925 }
926 return {InstructionOrdering, Scope};
927}
928
929void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
930 SDValue Src = N->getOperand(Num: 0);
931 AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(Val: N);
932 unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
933 unsigned DstAddrSpace = CastN->getDestAddressSpace();
934 SDLoc DL(N);
935 assert(SrcAddrSpace != DstAddrSpace &&
936 "addrspacecast must be between different address spaces");
937
938 if (DstAddrSpace == ADDRESS_SPACE_GENERIC) {
939 // Specific to generic
940
941 if (TM.is64Bit() && TM.getPointerSizeInBits(AS: SrcAddrSpace) == 32) {
942 SDValue CvtNone =
943 CurDAG->getTargetConstant(Val: NVPTX::PTXCvtMode::NONE, DL, VT: MVT::i32);
944 SDNode *Cvt = CurDAG->getMachineNode(Opcode: NVPTX::CVT_u64_u32, dl: DL, VT: MVT::i64,
945 Op1: Src, Op2: CvtNone);
946 Src = SDValue(Cvt, 0);
947 }
948
949 unsigned Opc;
950 switch (SrcAddrSpace) {
951 default: report_fatal_error(reason: "Bad address space in addrspacecast");
952 case ADDRESS_SPACE_GLOBAL:
953 Opc = TM.is64Bit() ? NVPTX::cvta_global_64 : NVPTX::cvta_global;
954 break;
955 case ADDRESS_SPACE_SHARED:
956 Opc = TM.is64Bit() ? NVPTX::cvta_shared_64 : NVPTX::cvta_shared;
957 break;
958 case ADDRESS_SPACE_SHARED_CLUSTER:
959 if (!TM.is64Bit())
960 report_fatal_error(
961 reason: "Shared cluster address space is only supported in 64-bit mode");
962 Opc = NVPTX::cvta_shared_cluster_64;
963 break;
964 case ADDRESS_SPACE_CONST:
965 Opc = TM.is64Bit() ? NVPTX::cvta_const_64 : NVPTX::cvta_const;
966 break;
967 case ADDRESS_SPACE_LOCAL:
968 Opc = TM.is64Bit() ? NVPTX::cvta_local_64 : NVPTX::cvta_local;
969 break;
970 case ADDRESS_SPACE_PARAM:
971 Opc = TM.is64Bit() ? NVPTX::cvta_param_64 : NVPTX::cvta_param;
972 break;
973 }
974 ReplaceNode(F: N, T: CurDAG->getMachineNode(Opcode: Opc, dl: DL, VT: N->getValueType(ResNo: 0), Op1: Src));
975 return;
976 } else {
977 // Generic to specific
978 if (SrcAddrSpace != 0)
979 report_fatal_error(reason: "Cannot cast between two non-generic address spaces");
980 unsigned Opc;
981 switch (DstAddrSpace) {
982 default: report_fatal_error(reason: "Bad address space in addrspacecast");
983 case ADDRESS_SPACE_GLOBAL:
984 Opc = TM.is64Bit() ? NVPTX::cvta_to_global_64 : NVPTX::cvta_to_global;
985 break;
986 case ADDRESS_SPACE_SHARED:
987 Opc = TM.is64Bit() ? NVPTX::cvta_to_shared_64 : NVPTX::cvta_to_shared;
988 break;
989 case ADDRESS_SPACE_SHARED_CLUSTER:
990 if (!TM.is64Bit())
991 report_fatal_error(
992 reason: "Shared cluster address space is only supported in 64-bit mode");
993 Opc = NVPTX::cvta_to_shared_cluster_64;
994 break;
995 case ADDRESS_SPACE_CONST:
996 Opc = TM.is64Bit() ? NVPTX::cvta_to_const_64 : NVPTX::cvta_to_const;
997 break;
998 case ADDRESS_SPACE_LOCAL:
999 Opc = TM.is64Bit() ? NVPTX::cvta_to_local_64 : NVPTX::cvta_to_local;
1000 break;
1001 case ADDRESS_SPACE_PARAM:
1002 Opc = TM.is64Bit() ? NVPTX::cvta_to_param_64 : NVPTX::cvta_to_param;
1003 break;
1004 }
1005
1006 SDNode *CVTA = CurDAG->getMachineNode(Opcode: Opc, dl: DL, VT: N->getValueType(ResNo: 0), Op1: Src);
1007 if (TM.is64Bit() && TM.getPointerSizeInBits(AS: DstAddrSpace) == 32) {
1008 SDValue CvtNone =
1009 CurDAG->getTargetConstant(Val: NVPTX::PTXCvtMode::NONE, DL, VT: MVT::i32);
1010 CVTA = CurDAG->getMachineNode(Opcode: NVPTX::CVT_u32_u64, dl: DL, VT: MVT::i32,
1011 Op1: SDValue(CVTA, 0), Op2: CvtNone);
1012 }
1013
1014 ReplaceNode(F: N, T: CVTA);
1015 return;
1016 }
1017}
1018
1019// Helper function template to reduce amount of boilerplate code for
1020// opcode selection.
1021static std::optional<unsigned>
1022pickOpcodeForVT(MVT::SimpleValueType VT, std::optional<unsigned> Opcode_i16,
1023 std::optional<unsigned> Opcode_i32,
1024 std::optional<unsigned> Opcode_i64) {
1025 switch (VT) {
1026 case MVT::f16:
1027 case MVT::i16:
1028 case MVT::bf16:
1029 return Opcode_i16;
1030 case MVT::v2f16:
1031 case MVT::v2bf16:
1032 case MVT::v2i16:
1033 case MVT::v4i8:
1034 case MVT::i32:
1035 case MVT::f32:
1036 return Opcode_i32;
1037 case MVT::v2f32:
1038 case MVT::v2i32:
1039 case MVT::i64:
1040 case MVT::f64:
1041 return Opcode_i64;
1042 default:
1043 return std::nullopt;
1044 }
1045}
1046
1047static inline bool isAddLike(const SDValue V) {
1048 return V.getOpcode() == ISD::ADD ||
1049 (V->getOpcode() == ISD::OR && V->getFlags().hasDisjoint());
1050}
1051
1052static SDValue stripAssertAlign(SDValue N) {
1053 if (N.getOpcode() == ISD::AssertAlign)
1054 N = N.getOperand(i: 0);
1055 return N;
1056}
1057
1058// selectBaseADDR - Match a dag node which will serve as the base address for an
1059// ADDR operand pair.
1060static SDValue selectBaseADDR(SDValue N, SelectionDAG *DAG) {
1061 N = stripAssertAlign(N);
1062 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(Val&: N))
1063 return DAG->getTargetGlobalAddress(GV: GA->getGlobal(), DL: SDLoc(N),
1064 VT: GA->getValueType(ResNo: 0), offset: GA->getOffset(),
1065 TargetFlags: GA->getTargetFlags());
1066 if (const auto *ES = dyn_cast<ExternalSymbolSDNode>(Val&: N))
1067 return DAG->getTargetExternalSymbol(Sym: ES->getSymbol(), VT: ES->getValueType(ResNo: 0),
1068 TargetFlags: ES->getTargetFlags());
1069 if (const auto *FIN = dyn_cast<FrameIndexSDNode>(Val&: N))
1070 return DAG->getTargetFrameIndex(FI: FIN->getIndex(), VT: FIN->getValueType(ResNo: 0));
1071
1072 return N;
1073}
1074
1075static SDValue accumulateOffset(SDValue &Addr, SDLoc DL, SelectionDAG *DAG) {
1076 Addr = stripAssertAlign(N: Addr);
1077 APInt AccumulatedOffset(64u, 0);
1078 while (isAddLike(V: Addr)) {
1079 const auto *CN = dyn_cast<ConstantSDNode>(Val: Addr.getOperand(i: 1));
1080 if (!CN)
1081 break;
1082
1083 const APInt CI = CN->getAPIntValue().sext(width: 64);
1084 if (!(CI + AccumulatedOffset).isSignedIntN(N: 32))
1085 break;
1086
1087 AccumulatedOffset += CI;
1088 Addr = stripAssertAlign(N: Addr->getOperand(Num: 0));
1089 }
1090 return DAG->getSignedTargetConstant(Val: AccumulatedOffset.getSExtValue(), DL,
1091 VT: MVT::i32);
1092}
1093
1094static std::pair<SDValue, SDValue> selectADDR(SDValue Addr, SelectionDAG *DAG) {
1095 SDValue Offset = accumulateOffset(Addr, DL: SDLoc(Addr), DAG);
1096 SDValue Base = selectBaseADDR(N: Addr, DAG);
1097 return {Base, Offset};
1098}
1099
1100// Select a pair of operands which represent a valid PTX address, this could be
1101// one of the following things:
1102// - [var] - Offset is simply set to 0
1103// - [reg] - Offset is simply set to 0
1104// - [reg+immOff]
1105// - [var+immOff]
1106// Note that immOff must fit into a 32-bit signed integer.
1107bool NVPTXDAGToDAGISel::SelectADDR(SDValue Addr, SDValue &Base,
1108 SDValue &Offset) {
1109 std::tie(args&: Base, args&: Offset) = selectADDR(Addr, DAG: CurDAG);
1110 return true;
1111}
1112
1113bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
1114 MemSDNode *LD = cast<MemSDNode>(Val: N);
1115 assert(LD->readMem() && "Expected load");
1116
1117 // do not support pre/post inc/dec
1118 const LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(Val: LD);
1119 if (PlainLoad && PlainLoad->isIndexed())
1120 return false;
1121
1122 // Address Space Setting
1123 const auto CodeAddrSpace = getAddrSpace(N: LD);
1124 if (canLowerToLDG(N: *LD, Subtarget: *Subtarget, CodeAddrSpace))
1125 return tryLDG(N: LD);
1126
1127 SDLoc DL(LD);
1128 SDValue Chain = N->getOperand(Num: 0);
1129 const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, N: LD);
1130
1131 const unsigned FromTypeWidth = LD->getMemoryVT().getSizeInBits();
1132
1133 // Vector Setting
1134 const unsigned FromType =
1135 (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
1136 ? NVPTX::PTXLdStInstCode::Signed
1137 : NVPTX::PTXLdStInstCode::Untyped;
1138
1139 uint32_t UsedBytesMask;
1140 switch (N->getOpcode()) {
1141 case ISD::LOAD:
1142 case ISD::ATOMIC_LOAD:
1143 UsedBytesMask = UINT32_MAX;
1144 break;
1145 case NVPTXISD::MLoad:
1146 UsedBytesMask = N->getConstantOperandVal(Num: 3);
1147 break;
1148 default:
1149 llvm_unreachable("Unexpected opcode");
1150 }
1151
1152 assert(isPowerOf2_32(FromTypeWidth) && FromTypeWidth >= 8 &&
1153 FromTypeWidth <= 128 && "Invalid width for load");
1154
1155 // Create the machine instruction DAG
1156 const auto [Base, Offset] = selectADDR(Addr: N->getOperand(Num: 1), DAG: CurDAG);
1157 SDValue Ops[] = {getI32Imm(Imm: Ordering, DL),
1158 getI32Imm(Imm: Scope, DL),
1159 getI32Imm(Imm: CodeAddrSpace, DL),
1160 getI32Imm(Imm: FromType, DL),
1161 getI32Imm(Imm: FromTypeWidth, DL),
1162 getI32Imm(Imm: UsedBytesMask, DL),
1163 Base,
1164 Offset,
1165 Chain};
1166
1167 const MVT::SimpleValueType TargetVT = LD->getSimpleValueType(ResNo: 0).SimpleTy;
1168 const std::optional<unsigned> Opcode =
1169 pickOpcodeForVT(VT: TargetVT, Opcode_i16: NVPTX::LD_i16, Opcode_i32: NVPTX::LD_i32, Opcode_i64: NVPTX::LD_i64);
1170 if (!Opcode)
1171 return false;
1172
1173 SDNode *NVPTXLD = CurDAG->getMachineNode(Opcode: *Opcode, dl: DL, VTs: LD->getVTList(), Ops);
1174 if (!NVPTXLD)
1175 return false;
1176
1177 MachineMemOperand *MemRef = LD->getMemOperand();
1178 CurDAG->setNodeMemRefs(N: cast<MachineSDNode>(Val: NVPTXLD), NewMemRefs: {MemRef});
1179
1180 ReplaceNode(F: LD, T: NVPTXLD);
1181 return true;
1182}
1183
1184static unsigned getStoreVectorNumElts(SDNode *N) {
1185 switch (N->getOpcode()) {
1186 case NVPTXISD::StoreV2:
1187 return 2;
1188 case NVPTXISD::StoreV4:
1189 return 4;
1190 case NVPTXISD::StoreV8:
1191 return 8;
1192 default:
1193 llvm_unreachable("Unexpected opcode");
1194 }
1195}
1196
1197bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
1198 MemSDNode *LD = cast<MemSDNode>(Val: N);
1199
1200 // Address Space Setting
1201 const auto CodeAddrSpace = getAddrSpace(N: LD);
1202 if (canLowerToLDG(N: *LD, Subtarget: *Subtarget, CodeAddrSpace))
1203 return tryLDG(N: LD);
1204
1205 const MVT EltVT = LD->getSimpleValueType(ResNo: 0);
1206 SDLoc DL(LD);
1207 SDValue Chain = LD->getChain();
1208 const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, N: LD);
1209
1210 // Type Setting: fromType + fromTypeWidth
1211 //
1212 // Sign : ISD::SEXTLOAD
1213 // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
1214 // type is integer
1215 // Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
1216 // Read at least 8 bits (predicates are stored as 8-bit values)
1217 // Get the original LoadSDNode::getExtensionType() value
1218 const unsigned ExtensionType = N->getConstantOperandVal(Num: 4);
1219 const unsigned FromType = (ExtensionType == ISD::SEXTLOAD)
1220 ? NVPTX::PTXLdStInstCode::Signed
1221 : NVPTX::PTXLdStInstCode::Untyped;
1222
1223 const unsigned FromTypeWidth = getFromTypeWidthForLoad(Mem: LD);
1224 const uint32_t UsedBytesMask = N->getConstantOperandVal(Num: 3);
1225
1226 assert(!(EltVT.isVector() && ExtensionType != ISD::NON_EXTLOAD));
1227
1228 const auto [Base, Offset] = selectADDR(Addr: N->getOperand(Num: 1), DAG: CurDAG);
1229 SDValue Ops[] = {getI32Imm(Imm: Ordering, DL),
1230 getI32Imm(Imm: Scope, DL),
1231 getI32Imm(Imm: CodeAddrSpace, DL),
1232 getI32Imm(Imm: FromType, DL),
1233 getI32Imm(Imm: FromTypeWidth, DL),
1234 getI32Imm(Imm: UsedBytesMask, DL),
1235 Base,
1236 Offset,
1237 Chain};
1238
1239 std::optional<unsigned> Opcode;
1240 switch (N->getOpcode()) {
1241 default:
1242 llvm_unreachable("Unexpected opcode");
1243 case NVPTXISD::LoadV2:
1244 Opcode = pickOpcodeForVT(VT: EltVT.SimpleTy, Opcode_i16: NVPTX::LDV_i16_v2,
1245 Opcode_i32: NVPTX::LDV_i32_v2, Opcode_i64: NVPTX::LDV_i64_v2);
1246 break;
1247 case NVPTXISD::LoadV4:
1248 Opcode = pickOpcodeForVT(VT: EltVT.SimpleTy, Opcode_i16: NVPTX::LDV_i16_v4,
1249 Opcode_i32: NVPTX::LDV_i32_v4, Opcode_i64: NVPTX::LDV_i64_v4);
1250 break;
1251 case NVPTXISD::LoadV8:
1252 Opcode = pickOpcodeForVT(VT: EltVT.SimpleTy, Opcode_i16: {/* no v8i16 */},
1253 Opcode_i32: NVPTX::LDV_i32_v8, Opcode_i64: {/* no v8i64 */});
1254 break;
1255 }
1256 if (!Opcode)
1257 return false;
1258
1259 SDNode *NVPTXLD = CurDAG->getMachineNode(Opcode: *Opcode, dl: DL, VTs: LD->getVTList(), Ops);
1260
1261 MachineMemOperand *MemRef = LD->getMemOperand();
1262 CurDAG->setNodeMemRefs(N: cast<MachineSDNode>(Val: NVPTXLD), NewMemRefs: {MemRef});
1263
1264 ReplaceNode(F: LD, T: NVPTXLD);
1265 return true;
1266}
1267
1268bool NVPTXDAGToDAGISel::tryLDG(MemSDNode *LD) {
1269 SDLoc DL(LD);
1270
1271 unsigned ExtensionType;
1272 uint32_t UsedBytesMask;
1273 if (const auto *Load = dyn_cast<LoadSDNode>(Val: LD)) {
1274 ExtensionType = Load->getExtensionType();
1275 UsedBytesMask = UINT32_MAX;
1276 } else {
1277 ExtensionType = LD->getConstantOperandVal(Num: 4);
1278 UsedBytesMask = LD->getConstantOperandVal(Num: 3);
1279 }
1280 const unsigned FromType = (ExtensionType == ISD::SEXTLOAD)
1281 ? NVPTX::PTXLdStInstCode::Signed
1282 : NVPTX::PTXLdStInstCode::Untyped;
1283
1284 const unsigned FromTypeWidth = getFromTypeWidthForLoad(Mem: LD);
1285
1286 assert(!(LD->getSimpleValueType(0).isVector() &&
1287 ExtensionType != ISD::NON_EXTLOAD));
1288
1289 const auto [Base, Offset] = selectADDR(Addr: LD->getOperand(Num: 1), DAG: CurDAG);
1290 SDValue Ops[] = {getI32Imm(Imm: FromType, DL),
1291 getI32Imm(Imm: FromTypeWidth, DL),
1292 getI32Imm(Imm: UsedBytesMask, DL),
1293 Base,
1294 Offset,
1295 LD->getChain()};
1296
1297 const MVT::SimpleValueType TargetVT = LD->getSimpleValueType(ResNo: 0).SimpleTy;
1298 std::optional<unsigned> Opcode;
1299 switch (LD->getOpcode()) {
1300 default:
1301 llvm_unreachable("Unexpected opcode");
1302 case ISD::LOAD:
1303 Opcode = pickOpcodeForVT(VT: TargetVT, Opcode_i16: NVPTX::LD_GLOBAL_NC_i16,
1304 Opcode_i32: NVPTX::LD_GLOBAL_NC_i32, Opcode_i64: NVPTX::LD_GLOBAL_NC_i64);
1305 break;
1306 case NVPTXISD::MLoad:
1307 Opcode = pickOpcodeForVT(VT: TargetVT, Opcode_i16: std::nullopt, Opcode_i32: NVPTX::LD_GLOBAL_NC_i32,
1308 Opcode_i64: NVPTX::LD_GLOBAL_NC_i64);
1309 break;
1310 case NVPTXISD::LoadV2:
1311 Opcode =
1312 pickOpcodeForVT(VT: TargetVT, Opcode_i16: NVPTX::LD_GLOBAL_NC_v2i16,
1313 Opcode_i32: NVPTX::LD_GLOBAL_NC_v2i32, Opcode_i64: NVPTX::LD_GLOBAL_NC_v2i64);
1314 break;
1315 case NVPTXISD::LoadV4:
1316 Opcode =
1317 pickOpcodeForVT(VT: TargetVT, Opcode_i16: NVPTX::LD_GLOBAL_NC_v4i16,
1318 Opcode_i32: NVPTX::LD_GLOBAL_NC_v4i32, Opcode_i64: NVPTX::LD_GLOBAL_NC_v4i64);
1319 break;
1320 case NVPTXISD::LoadV8:
1321 Opcode = pickOpcodeForVT(VT: TargetVT, Opcode_i16: {/* no v8i16 */},
1322 Opcode_i32: NVPTX::LD_GLOBAL_NC_v8i32, Opcode_i64: {/* no v8i64 */});
1323 break;
1324 }
1325 if (!Opcode)
1326 return false;
1327
1328 SDNode *NVPTXLDG = CurDAG->getMachineNode(Opcode: *Opcode, dl: DL, VTs: LD->getVTList(), Ops);
1329
1330 ReplaceNode(F: LD, T: NVPTXLDG);
1331 return true;
1332}
1333
1334unsigned NVPTXDAGToDAGISel::getFromTypeWidthForLoad(const MemSDNode *Mem) {
1335 auto TotalWidth = Mem->getMemoryVT().getSizeInBits();
1336 auto NumElts = Mem->getNumValues() - 1;
1337 auto ElementBitWidth = TotalWidth / NumElts;
1338 assert(isPowerOf2_32(ElementBitWidth) && ElementBitWidth >= 8 &&
1339 ElementBitWidth <= 128 && TotalWidth <= 256 &&
1340 "Invalid width for load");
1341 return ElementBitWidth;
1342}
1343
1344bool NVPTXDAGToDAGISel::tryLDU(SDNode *N) {
1345 auto *LD = cast<MemSDNode>(Val: N);
1346
1347 SDLoc DL(N);
1348 const unsigned FromTypeWidth = getFromTypeWidthForLoad(Mem: LD);
1349 const MVT::SimpleValueType TargetVT = LD->getSimpleValueType(ResNo: 0).SimpleTy;
1350
1351 // If this is an LDU intrinsic, the address is the third operand. If its an
1352 // LDU SD node (from custom vector handling), then its the second operand
1353 SDValue Addr =
1354 LD->getOperand(Num: LD->getOpcode() == ISD::INTRINSIC_W_CHAIN ? 2 : 1);
1355
1356 const auto [Base, Offset] = selectADDR(Addr, DAG: CurDAG);
1357 SDValue Ops[] = {getI32Imm(Imm: FromTypeWidth, DL), Base, Offset, LD->getChain()};
1358
1359 std::optional<unsigned> Opcode;
1360 switch (N->getOpcode()) {
1361 default:
1362 llvm_unreachable("Unexpected opcode");
1363 case ISD::INTRINSIC_W_CHAIN:
1364 Opcode = pickOpcodeForVT(VT: TargetVT, Opcode_i16: NVPTX::LDU_GLOBAL_i16,
1365 Opcode_i32: NVPTX::LDU_GLOBAL_i32, Opcode_i64: NVPTX::LDU_GLOBAL_i64);
1366 break;
1367 case NVPTXISD::LDUV2:
1368 Opcode = pickOpcodeForVT(VT: TargetVT, Opcode_i16: NVPTX::LDU_GLOBAL_v2i16,
1369 Opcode_i32: NVPTX::LDU_GLOBAL_v2i32, Opcode_i64: NVPTX::LDU_GLOBAL_v2i64);
1370 break;
1371 case NVPTXISD::LDUV4:
1372 Opcode = pickOpcodeForVT(VT: TargetVT, Opcode_i16: NVPTX::LDU_GLOBAL_v4i16,
1373 Opcode_i32: NVPTX::LDU_GLOBAL_v4i32, Opcode_i64: {/* no v4i64 */});
1374 break;
1375 }
1376 if (!Opcode)
1377 return false;
1378
1379 SDNode *NVPTXLDU = CurDAG->getMachineNode(Opcode: *Opcode, dl: DL, VTs: LD->getVTList(), Ops);
1380
1381 ReplaceNode(F: LD, T: NVPTXLDU);
1382 return true;
1383}
1384
1385bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
1386 MemSDNode *ST = cast<MemSDNode>(Val: N);
1387 assert(ST->writeMem() && "Expected store");
1388 StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(Val: ST);
1389 AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(Val: ST);
1390 assert((PlainStore || AtomicStore) && "Expected store");
1391
1392 // do not support pre/post inc/dec
1393 if (PlainStore && PlainStore->isIndexed())
1394 return false;
1395
1396 // Address Space Setting
1397 const auto CodeAddrSpace = getAddrSpace(N: ST);
1398
1399 SDLoc DL(ST);
1400 SDValue Chain = ST->getChain();
1401 const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, N: ST);
1402
1403 // Vector Setting
1404 const unsigned ToTypeWidth = ST->getMemoryVT().getSizeInBits();
1405
1406 // Create the machine instruction DAG
1407 SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
1408
1409 assert(isPowerOf2_32(ToTypeWidth) && ToTypeWidth >= 8 && ToTypeWidth <= 128 &&
1410 "Invalid width for store");
1411
1412 const auto [Base, Offset] = selectADDR(Addr: ST->getBasePtr(), DAG: CurDAG);
1413 SDValue Ops[] = {selectPossiblyImm(V: Value),
1414 getI32Imm(Imm: Ordering, DL),
1415 getI32Imm(Imm: Scope, DL),
1416 getI32Imm(Imm: CodeAddrSpace, DL),
1417 getI32Imm(Imm: ToTypeWidth, DL),
1418 Base,
1419 Offset,
1420 Chain};
1421
1422 const std::optional<unsigned> Opcode =
1423 pickOpcodeForVT(VT: Value.getSimpleValueType().SimpleTy, Opcode_i16: NVPTX::ST_i16,
1424 Opcode_i32: NVPTX::ST_i32, Opcode_i64: NVPTX::ST_i64);
1425 if (!Opcode)
1426 return false;
1427
1428 SDNode *NVPTXST = CurDAG->getMachineNode(Opcode: *Opcode, dl: DL, VT: MVT::Other, Ops);
1429
1430 if (!NVPTXST)
1431 return false;
1432
1433 MachineMemOperand *MemRef = ST->getMemOperand();
1434 CurDAG->setNodeMemRefs(N: cast<MachineSDNode>(Val: NVPTXST), NewMemRefs: {MemRef});
1435 ReplaceNode(F: ST, T: NVPTXST);
1436 return true;
1437}
1438
1439bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
1440 MemSDNode *ST = cast<MemSDNode>(Val: N);
1441 const unsigned TotalWidth = ST->getMemoryVT().getSizeInBits();
1442
1443 // Address Space Setting
1444 const auto CodeAddrSpace = getAddrSpace(N: ST);
1445 if (CodeAddrSpace == NVPTX::AddressSpace::Const) {
1446 report_fatal_error(reason: "Cannot store to pointer that points to constant "
1447 "memory space");
1448 }
1449
1450 SDLoc DL(ST);
1451 SDValue Chain = ST->getChain();
1452 const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, N: ST);
1453
1454 const unsigned NumElts = getStoreVectorNumElts(N: ST);
1455
1456 SmallVector<SDValue, 16> Ops;
1457 for (auto &V : ST->ops().slice(N: 1, M: NumElts))
1458 Ops.push_back(Elt: selectPossiblyImm(V));
1459 SDValue Addr = N->getOperand(Num: NumElts + 1);
1460 const unsigned ToTypeWidth = TotalWidth / NumElts;
1461
1462 assert(isPowerOf2_32(ToTypeWidth) && ToTypeWidth >= 8 && ToTypeWidth <= 128 &&
1463 TotalWidth <= 256 && "Invalid width for store");
1464
1465 const auto [Base, Offset] = selectADDR(Addr, DAG: CurDAG);
1466 Ops.append(IL: {getI32Imm(Imm: Ordering, DL), getI32Imm(Imm: Scope, DL),
1467 getI32Imm(Imm: CodeAddrSpace, DL), getI32Imm(Imm: ToTypeWidth, DL), Base,
1468 Offset, Chain});
1469
1470 const MVT::SimpleValueType EltVT =
1471 ST->getOperand(Num: 1).getSimpleValueType().SimpleTy;
1472 std::optional<unsigned> Opcode;
1473 switch (ST->getOpcode()) {
1474 default:
1475 return false;
1476 case NVPTXISD::StoreV2:
1477 Opcode = pickOpcodeForVT(VT: EltVT, Opcode_i16: NVPTX::STV_i16_v2, Opcode_i32: NVPTX::STV_i32_v2,
1478 Opcode_i64: NVPTX::STV_i64_v2);
1479 break;
1480 case NVPTXISD::StoreV4:
1481 Opcode = pickOpcodeForVT(VT: EltVT, Opcode_i16: NVPTX::STV_i16_v4, Opcode_i32: NVPTX::STV_i32_v4,
1482 Opcode_i64: NVPTX::STV_i64_v4);
1483 break;
1484 case NVPTXISD::StoreV8:
1485 Opcode = pickOpcodeForVT(VT: EltVT, Opcode_i16: {/* no v8i16 */}, Opcode_i32: NVPTX::STV_i32_v8,
1486 Opcode_i64: {/* no v8i64 */});
1487 break;
1488 }
1489
1490 if (!Opcode)
1491 return false;
1492
1493 SDNode *NVPTXST = CurDAG->getMachineNode(Opcode: *Opcode, dl: DL, VT: MVT::Other, Ops);
1494
1495 MachineMemOperand *MemRef = ST->getMemOperand();
1496 CurDAG->setNodeMemRefs(N: cast<MachineSDNode>(Val: NVPTXST), NewMemRefs: {MemRef});
1497
1498 ReplaceNode(F: ST, T: NVPTXST);
1499 return true;
1500}
1501
1502/// SelectBFE - Look for instruction sequences that can be made more efficient
1503/// by using the 'bfe' (bit-field extract) PTX instruction
1504bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
1505 SDLoc DL(N);
1506 SDValue LHS = N->getOperand(Num: 0);
1507 SDValue RHS = N->getOperand(Num: 1);
1508 SDValue Len;
1509 SDValue Start;
1510 SDValue Val;
1511 bool IsSigned = false;
1512
1513 if (N->getOpcode() == ISD::AND) {
1514 // Canonicalize the operands
1515 // We want 'and %val, %mask'
1516 if (isa<ConstantSDNode>(Val: LHS) && !isa<ConstantSDNode>(Val: RHS)) {
1517 std::swap(a&: LHS, b&: RHS);
1518 }
1519
1520 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Val&: RHS);
1521 if (!Mask) {
1522 // We need a constant mask on the RHS of the AND
1523 return false;
1524 }
1525
1526 // Extract the mask bits
1527 uint64_t MaskVal = Mask->getZExtValue();
1528 if (!isMask_64(Value: MaskVal)) {
1529      // We *could* handle shifted masks here, but doing so would require an
1530      // extra 'and' operation to fix up the low-order bits, so we would be
1531      // trading shr+and for bfe+and, which has the same throughput.
1532 return false;
1533 }
1534
1535 // How many bits are in our mask?
1536 int64_t NumBits = countr_one(Value: MaskVal);
1537 Len = CurDAG->getTargetConstant(Val: NumBits, DL, VT: MVT::i32);
1538
1539 if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {
1540 // We have a 'srl/and' pair, extract the effective start bit and length
1541 Val = LHS.getNode()->getOperand(Num: 0);
1542 Start = LHS.getNode()->getOperand(Num: 1);
1543 ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Val&: Start);
1544 if (StartConst) {
1545 uint64_t StartVal = StartConst->getZExtValue();
1546 // How many "good" bits do we have left? "good" is defined here as bits
1547 // that exist in the original value, not shifted in.
1548 int64_t GoodBits = Start.getValueSizeInBits() - StartVal;
1549 if (NumBits > GoodBits) {
1550 // Do not handle the case where bits have been shifted in. In theory
1551 // we could handle this, but the cost is likely higher than just
1552 // emitting the srl/and pair.
1553 return false;
1554 }
1555 Start = CurDAG->getTargetConstant(Val: StartVal, DL, VT: MVT::i32);
1556 } else {
1557        // Do not handle the case where the shift amount is not a constant.
1558        // We could handle this case, but it would require run-time logic
1559        // that would be more expensive than just emitting the srl/and
1560        // pair.
1561 return false;
1562 }
1563 } else {
1564 // Do not handle the case where the LHS of the and is not a shift. While
1565 // it would be trivial to handle this case, it would just transform
1566      // 'and' -> 'bfe', but 'and' has higher throughput.
1567 return false;
1568 }
1569 } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
1570 if (LHS->getOpcode() == ISD::AND) {
1571 ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(Val&: RHS);
1572 if (!ShiftCnst) {
1573 // Shift amount must be constant
1574 return false;
1575 }
1576
1577 uint64_t ShiftAmt = ShiftCnst->getZExtValue();
1578
1579 SDValue AndLHS = LHS->getOperand(Num: 0);
1580 SDValue AndRHS = LHS->getOperand(Num: 1);
1581
1582 // Canonicalize the AND to have the mask on the RHS
1583 if (isa<ConstantSDNode>(Val: AndLHS)) {
1584 std::swap(a&: AndLHS, b&: AndRHS);
1585 }
1586
1587 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Val&: AndRHS);
1588 if (!MaskCnst) {
1589 // Mask must be constant
1590 return false;
1591 }
1592
1593 uint64_t MaskVal = MaskCnst->getZExtValue();
1594 uint64_t NumZeros;
1595 uint64_t NumBits;
1596 if (isMask_64(Value: MaskVal)) {
1597 NumZeros = 0;
1598 // The number of bits in the result bitfield will be the number of
1599 // trailing ones (the AND) minus the number of bits we shift off
1600 NumBits = llvm::countr_one(Value: MaskVal) - ShiftAmt;
1601 } else if (isShiftedMask_64(Value: MaskVal)) {
1602 NumZeros = llvm::countr_zero(Val: MaskVal);
1603 unsigned NumOnes = llvm::countr_one(Value: MaskVal >> NumZeros);
1604 // The number of bits in the result bitfield will be the number of
1605 // trailing zeros plus the number of set bits in the mask minus the
1606 // number of bits we shift off
1607 NumBits = NumZeros + NumOnes - ShiftAmt;
1608 } else {
1609 // This is not a mask we can handle
1610 return false;
1611 }
1612
1613 if (ShiftAmt < NumZeros) {
1614 // Handling this case would require extra logic that would make this
1615 // transformation non-profitable
1616 return false;
1617 }
1618
1619 Val = AndLHS;
1620 Start = CurDAG->getTargetConstant(Val: ShiftAmt, DL, VT: MVT::i32);
1621 Len = CurDAG->getTargetConstant(Val: NumBits, DL, VT: MVT::i32);
1622
1623      // If the pre-shift AND mask includes the sign bit of the bitfield, we
1624      // must use signed BFE to replicate that bit during bitfield extraction.
1625      // If the sign bit is not part of the mask, unsigned BFE will zero out
1626      // the upper bits of the result.
1627 if (N->getOpcode() == ISD::SRA)
1628 IsSigned = (ShiftAmt + NumBits) == Val.getValueSizeInBits();
1629 } else if (LHS->getOpcode() == ISD::SHL) {
1630 // Here, we have a pattern like:
1631 //
1632 // (sra (shl val, NN), MM)
1633 // or
1634 // (srl (shl val, NN), MM)
1635 //
1636 // If MM >= NN, we can efficiently optimize this with bfe
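      // For example, (sra (shl %x, 24), 26) on i32 becomes
      // bfe.s32 %r, %x, 2, 6 (Start = 26 - 24, Len = 32 - 26).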
1637 Val = LHS->getOperand(Num: 0);
1638
1639 SDValue ShlRHS = LHS->getOperand(Num: 1);
1640 ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(Val&: ShlRHS);
1641 if (!ShlCnst) {
1642 // Shift amount must be constant
1643 return false;
1644 }
1645 uint64_t InnerShiftAmt = ShlCnst->getZExtValue();
1646
1647 SDValue ShrRHS = RHS;
1648 ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(Val&: ShrRHS);
1649 if (!ShrCnst) {
1650 // Shift amount must be constant
1651 return false;
1652 }
1653 uint64_t OuterShiftAmt = ShrCnst->getZExtValue();
1654
1655 // To avoid extra codegen and be profitable, we need Outer >= Inner
1656 if (OuterShiftAmt < InnerShiftAmt) {
1657 return false;
1658 }
1659
1660      // If the outer shift is at least the type size, there is no bitfield to
1661      // extract (and since we already checked that the inner shift is <= the
1662      // outer shift, this also implies that the inner shift is < the type size).
1663 if (OuterShiftAmt >= Val.getValueSizeInBits()) {
1664 return false;
1665 }
1666
1667 Start = CurDAG->getTargetConstant(Val: OuterShiftAmt - InnerShiftAmt, DL,
1668 VT: MVT::i32);
1669 Len = CurDAG->getTargetConstant(Val: Val.getValueSizeInBits() - OuterShiftAmt,
1670 DL, VT: MVT::i32);
1671
1672 if (N->getOpcode() == ISD::SRA) {
1673        // If we have an arithmetic right shift, we need to use the signed bfe
1674        // variant.
1675 IsSigned = true;
1676 }
1677 } else {
1678      // The shift's LHS is neither an 'and' nor a 'shl'; not a BFE pattern.
1679 return false;
1680 }
1681 } else {
1682    // The root node is neither an 'and' nor a right shift we can handle.
1683 return false;
1684 }
1685
1687 unsigned Opc;
1688  // Select the 32- or 64-bit BFE variant, using the signed form when the sign
1689  // bit must be replicated (IsSigned) and the unsigned form otherwise.
1690 if (Val.getValueType() == MVT::i32) {
1691 if (IsSigned) {
1692 Opc = NVPTX::BFE_S32rii;
1693 } else {
1694 Opc = NVPTX::BFE_U32rii;
1695 }
1696 } else if (Val.getValueType() == MVT::i64) {
1697 if (IsSigned) {
1698 Opc = NVPTX::BFE_S64rii;
1699 } else {
1700 Opc = NVPTX::BFE_U64rii;
1701 }
1702 } else {
1703 // We cannot handle this type
1704 return false;
1705 }
1706
1707  SDValue Ops[] = {Val, Start, Len};
1710
1711 ReplaceNode(F: N, T: CurDAG->getMachineNode(Opcode: Opc, dl: DL, VTs: N->getVTList(), Ops));
1712 return true;
1713}
1714
1715// Select bf16/v2bf16 FADD, FSUB, FMUL as FMA on targets where only FMA is native.
1716bool NVPTXDAGToDAGISel::tryBF16ArithToFMA(SDNode *N) {
1717 EVT VT = SDValue(N, 0).getValueType();
1718 if (VT.getScalarType() != MVT::bf16)
1719 return false;
1720
1721 const NVPTXSubtarget *STI = TM.getSubtargetImpl();
1722 if (STI->hasNativeBF16Support(Opcode: N->getOpcode()))
1723 return false;
1724
1725 const bool IsVec = VT.isVector();
1726 assert(!IsVec || VT.getVectorNumElements() == 2);
1727 SDLoc DL(N);
1728 SDValue N0 = N->getOperand(Num: 0);
1729 SDValue N1 = N->getOperand(Num: 1);
1730 SmallVector<SDValue, 3> Operands;
1731 auto GetConstant = [&](float Value) -> SDValue {
1732 // BF16 immediates must be legalized to integer register values
1733 APFloat APF(Value);
1734 bool LosesInfo;
1735 APF.convert(ToSemantics: APFloat::BFloat(), RM: APFloat::rmNearestTiesToEven, losesInfo: &LosesInfo);
1736 assert(!LosesInfo);
1737 if (IsVec) {
1738 auto API = APF.bitcastToAPInt();
1739 API = API.concat(NewLSB: API);
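      // e.g. bf16 1.0 is 0x3F80, so the v2bf16 splat is the i32 0x3F803F80.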
1740 auto Const = CurDAG->getTargetConstant(Val: API, DL, VT: MVT::i32);
1741 return SDValue(CurDAG->getMachineNode(Opcode: NVPTX::MOV_B32_i, dl: DL, VT, Op1: Const),
1742 0);
1743 }
1744 auto Const = CurDAG->getTargetConstantFP(Val: APF, DL, VT);
1745 return SDValue(CurDAG->getMachineNode(Opcode: NVPTX::MOV_BF16_i, dl: DL, VT, Op1: Const), 0);
1746 };
1747
1748 switch (N->getOpcode()) {
1749 case ISD::FADD:
1750 // add(a, b) -> fma(a, 1.0, b)
1751 Operands = {N0, GetConstant(1.0), N1};
1752 break;
1753 case ISD::FSUB:
1754 // sub(a, b) -> fma(b, -1.0, a)
1755 Operands = {N1, GetConstant(-1.0), N0};
1756 break;
1757 case ISD::FMUL:
1758    // mul(a, b) -> fma(a, b, -0.0). NOTE: the identity is -0.0, not +0.0,
1759    // because x + (-0.0) == x for all x, while -0.0 + (+0.0) == +0.0.
1760 Operands = {N0, N1, GetConstant(-0.0)};
1761 break;
1762 default:
1763 llvm_unreachable("Unexpected opcode");
1764  }
1765
1766 int Opcode = IsVec ? NVPTX::FMA_BF16x2rrr : NVPTX::FMA_BF16rrr;
1767 MachineSDNode *FMA = CurDAG->getMachineNode(Opcode, dl: DL, VT, Ops: Operands);
1768 ReplaceNode(F: N, T: FMA);
1769 return true;
1770}
1771
1772SDValue NVPTXDAGToDAGISel::selectPossiblyImm(SDValue V) {
1773 if (V.getOpcode() == ISD::BITCAST)
1774 V = V.getOperand(i: 0);
1775
1776 if (auto *CN = dyn_cast<ConstantSDNode>(Val&: V))
1777 return CurDAG->getTargetConstant(Val: CN->getAPIntValue(), DL: SDLoc(V),
1778 VT: V.getValueType());
1779 if (auto *CN = dyn_cast<ConstantFPSDNode>(Val&: V))
1780 return CurDAG->getTargetConstantFP(Val: CN->getValueAPF(), DL: SDLoc(V),
1781 VT: V.getValueType());
1782 return V;
1783}
1784
1785/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
1786/// inline asm expressions.
1787bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
1788 const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
1789 std::vector<SDValue> &OutOps) {
1790 switch (ConstraintID) {
1791 default:
1792 return true;
1793 case InlineAsm::ConstraintCode::m: { // memory
1794 const auto [Base, Offset] = selectADDR(Addr: Op, DAG: CurDAG);
1795 OutOps.push_back(x: Base);
1796 OutOps.push_back(x: Offset);
1797 return false;
1798 }
1799 }
1800 return true;
1801}
1802
1803void NVPTXDAGToDAGISel::SelectV2I64toI128(SDNode *N) {
1804 // Lower a CopyToReg with two 64-bit inputs
1805 // Dst:i128, lo:i64, hi:i64
1806 //
1807 // CopyToReg Dst, lo, hi;
1808 //
1809 // ==>
1810 //
1811 // tmp = V2I64toI128 {lo, hi};
1812 // CopyToReg Dst, tmp;
1813 SDValue Dst = N->getOperand(Num: 1);
1814 SDValue Lo = N->getOperand(Num: 2);
1815 SDValue Hi = N->getOperand(Num: 3);
1816
1817 SDLoc DL(N);
1818 SDNode *Mov =
1819 CurDAG->getMachineNode(Opcode: NVPTX::V2I64toI128, dl: DL, VT: MVT::i128, Ops: {Lo, Hi});
1820
1821 SmallVector<SDValue, 4> NewOps(N->getNumOperands() - 1);
1822 NewOps[0] = N->getOperand(Num: 0);
1823 NewOps[1] = Dst;
1824 NewOps[2] = SDValue(Mov, 0);
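  // If present, the fifth operand is the incoming glue; carry it over.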
1825 if (N->getNumOperands() == 5)
1826 NewOps[3] = N->getOperand(Num: 4);
1827 SDValue NewValue = CurDAG->getNode(Opcode: ISD::CopyToReg, DL, ResultTys: SmallVector<EVT>(N->values()), Ops: NewOps);
1828
1829 ReplaceNode(F: N, T: NewValue.getNode());
1830}
1831
1832void NVPTXDAGToDAGISel::SelectI128toV2I64(SDNode *N) {
1833 // Lower CopyFromReg from a 128-bit regs to two 64-bit regs
1834 // Dst:i128, Src:i128
1835 //
1836 // {lo, hi} = CopyFromReg Src
1837 //
1838 // ==>
1839 //
1840 // {lo, hi} = I128toV2I64 Src
1841 //
1842 SDValue Ch = N->getOperand(Num: 0);
1843 SDValue Src = N->getOperand(Num: 1);
1844 SDValue Glue = N->getOperand(Num: 2);
1845 SDLoc DL(N);
1846
1847  // Add Glue and Ch to the operands and results to avoid breaking the
1848  // execution order.
1849 SDNode *Mov = CurDAG->getMachineNode(
1850 Opcode: NVPTX::I128toV2I64, dl: DL,
1851 ResultTys: {MVT::i64, MVT::i64, Ch.getValueType(), Glue.getValueType()},
1852 Ops: {Src, Ch, Glue});
1853
1854 ReplaceNode(F: N, T: Mov);
1855}
1856
1857bool NVPTXDAGToDAGISel::tryFence(SDNode *N) {
1858 SDLoc DL(N);
1859 assert(N->getOpcode() == ISD::ATOMIC_FENCE);
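  // ISD::ATOMIC_FENCE operands are {Chain, ordering, syncscope ID}.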
1860 unsigned int FenceOp =
1861 getFenceOp(O: NVPTX::Ordering(N->getConstantOperandVal(Num: 1)),
1862 S: Scopes[N->getConstantOperandVal(Num: 2)], T: Subtarget);
1863 SDValue Chain = N->getOperand(Num: 0);
1864 SDNode *FenceNode = CurDAG->getMachineNode(Opcode: FenceOp, dl: DL, VT: MVT::Other, Op1: Chain);
1865 ReplaceNode(F: N, T: FenceNode);
1866 return true;
1867}
1868
1869NVPTXScopes::NVPTXScopes(LLVMContext &C) : Context(&C) {
1870 Scopes[C.getOrInsertSyncScopeID(SSN: "singlethread")] = NVPTX::Scope::Thread;
1871 Scopes[C.getOrInsertSyncScopeID(SSN: "")] = NVPTX::Scope::System;
1872 Scopes[C.getOrInsertSyncScopeID(SSN: "block")] = NVPTX::Scope::Block;
1873 Scopes[C.getOrInsertSyncScopeID(SSN: "cluster")] = NVPTX::Scope::Cluster;
1874 Scopes[C.getOrInsertSyncScopeID(SSN: "device")] = NVPTX::Scope::Device;
1875}
1876
1877NVPTX::Scope NVPTXScopes::operator[](SyncScope::ID ID) const {
1878 if (Scopes.empty())
1879 llvm_unreachable("NVPTX Scopes must be initialized before calling "
1880 "NVPTXScopes::operator[]");
1881
1882 auto S = Scopes.find(Key: ID);
1883 if (S == Scopes.end()) {
1884 auto scopeName = Context->getSyncScopeName(Id: ID);
1885 assert(scopeName.has_value() && "Scope name must exist.");
1886
1887 // Build list of supported syncscopes programmatically
1888 SmallVector<StringRef> supportedScopes;
1889 for (const auto &Entry : Scopes) {
1890 if (auto name = Context->getSyncScopeName(Id: Entry.first))
1891 supportedScopes.push_back(Elt: name->empty() ? "<empty string>" : *name);
1892 }
1893
1894 reportFatalUsageError(
1895 reason: formatv(Fmt: "NVPTX backend does not support syncscope \"{0}\" (ID={1}).\n"
1896 "Supported syncscopes are: {2}.",
1897 Vals&: scopeName.value(), Vals: int(ID),
1898 Vals: make_range(x: supportedScopes.begin(), y: supportedScopes.end())));
1899 }
1900 return S->second;
1901}
1902
1903bool NVPTXScopes::empty() const { return Scopes.empty(); }
1904
1905#define CP_ASYNC_BULK_TENSOR_OPCODE(dir, dim, mode, is_s32, suffix) \
1906 (is_s32 \
1907 ? NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_SHARED32_##mode##suffix \
1908 : NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_##mode##suffix)
1909
1910#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(dim, mode, is_ch, is_s32) \
1911 (is_ch ? (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, _CH)) \
1912 : (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, )))
1913
1914static unsigned GetCpAsyncBulkTensorS2GReductionOpcode(size_t Dim,
1915 bool IsShared32,
1916 bool IsCacheHint,
1917 bool IsIm2Col) {
1918 if (IsIm2Col) {
1919 switch (Dim) {
1920 case 3:
1921 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(3D, IM2COL, IsCacheHint,
1922 IsShared32);
1923 case 4:
1924 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(4D, IM2COL, IsCacheHint,
1925 IsShared32);
1926 case 5:
1927 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(5D, IM2COL, IsCacheHint,
1928 IsShared32);
1929 default:
1930 llvm_unreachable("Invalid Dimension in im2col mode for "
1931 "GetCpAsyncBulkTensorS2GReductionOpcode.");
1932 }
1933 } else {
1934 switch (Dim) {
1935 case 1:
1936 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(1D, TILE, IsCacheHint,
1937 IsShared32);
1938 case 2:
1939 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(2D, TILE, IsCacheHint,
1940 IsShared32);
1941 case 3:
1942 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(3D, TILE, IsCacheHint,
1943 IsShared32);
1944 case 4:
1945 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(4D, TILE, IsCacheHint,
1946 IsShared32);
1947 case 5:
1948 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(5D, TILE, IsCacheHint,
1949 IsShared32);
1950 default:
1951 llvm_unreachable("Invalid Dimension in tile mode for "
1952 "GetCpAsyncBulkTensorS2GReductionOpcode.");
1953 }
1954 }
1955}
1956
1957void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N,
1958 unsigned RedOp,
1959 bool IsIm2Col) {
1960  // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
1961 // src, dst, dims{d0...dN}, cache_hint, cache_hint_flag
1962 // NumOperands = {Chain, IID} + {Actual intrinsic args}
1963 // = {2} + {4 + dims}
1964 size_t NumOps = N->getNumOperands();
1965 size_t NumDims = NumOps - 6;
1966 bool IsCacheHint = N->getConstantOperandVal(Num: NumOps - 1) == 1;
1967  size_t NumArgs = NumDims + (IsCacheHint ? 3 : 2); // src, dst, [cache_hint]
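  // For example, a 3D tile reduction with a cache hint has NumOps = 2 + 4 + 3,
  // so NumDims = 3 and NumArgs = 6 (src, dst, d0..d2, cache_hint).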
1968
1969 SDLoc DL(N);
1970 SmallVector<SDValue, 12> Ops(N->ops().slice(N: 2, M: NumArgs));
1971 Ops.push_back(Elt: getI32Imm(Imm: RedOp, DL)); // Reduction Op
1972 Ops.push_back(Elt: N->getOperand(Num: 0)); // Chain operand
1973
1974 bool IsShared32 =
1975 CurDAG->getDataLayout().getPointerSizeInBits(AS: ADDRESS_SPACE_SHARED) == 32;
1976 unsigned Opcode = GetCpAsyncBulkTensorS2GReductionOpcode(
1977 Dim: NumDims, IsShared32, IsCacheHint, IsIm2Col);
1978 ReplaceNode(F: N, T: CurDAG->getMachineNode(Opcode, dl: DL, VTs: N->getVTList(), Ops));
1979}
1980
1981#define TCGEN05_ST_OPCODE(SHAPE, NUM) \
1982 (enableUnpack ? NVPTX::TCGEN05_ST_##SHAPE##_##NUM##_UNPACK \
1983 : NVPTX::TCGEN05_ST_##SHAPE##_##NUM)
1984
1985static unsigned getTcgen05StOpcode(unsigned IID, bool enableUnpack) {
1986 switch (IID) {
1987 case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
1988 return TCGEN05_ST_OPCODE(16x64b, x1);
1989 case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
1990 return TCGEN05_ST_OPCODE(16x64b, x2);
1991 case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
1992 return TCGEN05_ST_OPCODE(16x64b, x4);
1993 case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
1994 return TCGEN05_ST_OPCODE(16x64b, x8);
1995 case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
1996 return TCGEN05_ST_OPCODE(16x64b, x16);
1997 case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
1998 return TCGEN05_ST_OPCODE(16x64b, x32);
1999 case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
2000 return TCGEN05_ST_OPCODE(16x64b, x64);
2001 case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
2002 return TCGEN05_ST_OPCODE(16x64b, x128);
2003 case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
2004 return TCGEN05_ST_OPCODE(16x128b, x1);
2005 case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
2006 return TCGEN05_ST_OPCODE(16x128b, x2);
2007 case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
2008 return TCGEN05_ST_OPCODE(16x128b, x4);
2009 case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
2010 return TCGEN05_ST_OPCODE(16x128b, x8);
2011 case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
2012 return TCGEN05_ST_OPCODE(16x128b, x16);
2013 case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
2014 return TCGEN05_ST_OPCODE(16x128b, x32);
2015 case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
2016 return TCGEN05_ST_OPCODE(16x128b, x64);
2017 case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
2018 return TCGEN05_ST_OPCODE(16x256b, x1);
2019 case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
2020 return TCGEN05_ST_OPCODE(16x256b, x2);
2021 case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
2022 return TCGEN05_ST_OPCODE(16x256b, x4);
2023 case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
2024 return TCGEN05_ST_OPCODE(16x256b, x8);
2025 case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
2026 return TCGEN05_ST_OPCODE(16x256b, x16);
2027 case Intrinsic::nvvm_tcgen05_st_16x256b_x32:
2028 return TCGEN05_ST_OPCODE(16x256b, x32);
2029 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1:
2030 return TCGEN05_ST_OPCODE(16x32bx2, x1);
2031 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2:
2032 return TCGEN05_ST_OPCODE(16x32bx2, x2);
2033 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4:
2034 return TCGEN05_ST_OPCODE(16x32bx2, x4);
2035 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8:
2036 return TCGEN05_ST_OPCODE(16x32bx2, x8);
2037 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16:
2038 return TCGEN05_ST_OPCODE(16x32bx2, x16);
2039 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32:
2040 return TCGEN05_ST_OPCODE(16x32bx2, x32);
2041 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64:
2042 return TCGEN05_ST_OPCODE(16x32bx2, x64);
2043 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128:
2044 return TCGEN05_ST_OPCODE(16x32bx2, x128);
2045 case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
2046 return TCGEN05_ST_OPCODE(32x32b, x1);
2047 case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
2048 return TCGEN05_ST_OPCODE(32x32b, x2);
2049 case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
2050 return TCGEN05_ST_OPCODE(32x32b, x4);
2051 case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
2052 return TCGEN05_ST_OPCODE(32x32b, x8);
2053 case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
2054 return TCGEN05_ST_OPCODE(32x32b, x16);
2055 case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
2056 return TCGEN05_ST_OPCODE(32x32b, x32);
2057 case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
2058 return TCGEN05_ST_OPCODE(32x32b, x64);
2059 case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
2060 return TCGEN05_ST_OPCODE(32x32b, x128);
2061 }
2062 llvm_unreachable("unhandled tcgen05.st lowering");
2063}
2064
2065void NVPTXDAGToDAGISel::SelectTcgen05St(SDNode *N, bool hasOffset) {
2066 if (!Subtarget->hasTcgen05InstSupport())
2067 report_fatal_error(
2068 reason: "tcgen05.st is not supported on this architecture variant");
2069
2070 SDLoc DL(N);
2071 unsigned IID = cast<ConstantSDNode>(Val: N->getOperand(Num: 1))->getZExtValue();
2072
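  // The intrinsic's operands are laid out as
  // {Chain, IID, taddr, [offset,] val_0, ..., val_{N-1}, unpack_flag}.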
2073 SmallVector<SDValue, 128> Operands = {
2074 N->getOperand(Num: 2) // taddr
2075 };
2076
2077 if (hasOffset)
2078 Operands.push_back(Elt: CurDAG->getTargetConstant(
2079 Val: cast<ConstantSDNode>(Val: N->getOperand(Num: 3))->getZExtValue(), DL,
2080 VT: MVT::i32)); // Offset
2081
2082 for (unsigned I = hasOffset ? 4 : 3; I < (N->getNumOperands() - 1); I++)
2083 Operands.push_back(Elt: N->getOperand(Num: I));
2084
2085 bool enableUnpack =
2086 cast<ConstantSDNode>(Val: N->getOperand(Num: N->getNumOperands() - 1))
2087 ->getZExtValue();
2088
2089 Operands.push_back(Elt: N->getOperand(Num: 0)); // Chain
2090 ReplaceNode(F: N, T: CurDAG->getMachineNode(Opcode: getTcgen05StOpcode(IID, enableUnpack),
2091 dl: DL, VTs: N->getVTList(), Ops: Operands));
2092}
2093
2094bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
2095 unsigned IID = N->getConstantOperandVal(Num: 1);
2096 using TMARedTy = llvm::nvvm::TMAReductionOp;
2097 auto CastTy = [](TMARedTy Op) { return static_cast<unsigned>(Op); };
2098 switch (IID) {
2099 default:
2100 return false;
2101 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_1d:
2102 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_2d:
2103 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_3d:
2104 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_4d:
2105 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_5d:
2106 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::ADD));
2107 return true;
2108 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_3d:
2109 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_4d:
2110 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_5d:
2111 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::ADD),
2112 /*IsIm2Col=*/true);
2113 return true;
2114 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_1d:
2115 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_2d:
2116 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_3d:
2117 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_4d:
2118 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_5d:
2119 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::MIN));
2120 return true;
2121 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_3d:
2122 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_4d:
2123 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_5d:
2124 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::MIN),
2125 /*IsIm2Col=*/true);
2126 return true;
2127 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_1d:
2128 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_2d:
2129 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_3d:
2130 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_4d:
2131 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_5d:
2132 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::MAX));
2133 return true;
2134 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_3d:
2135 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_4d:
2136 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_5d:
2137 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::MAX),
2138 /*IsIm2Col=*/true);
2139 return true;
2140 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_1d:
2141 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_2d:
2142 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_3d:
2143 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_4d:
2144 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_5d:
2145 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::INC));
2146 return true;
2147 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_3d:
2148 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_4d:
2149 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_5d:
2150 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::INC),
2151 /*IsIm2Col=*/true);
2152 return true;
2153 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_1d:
2154 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_2d:
2155 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_3d:
2156 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_4d:
2157 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_5d:
2158 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::DEC));
2159 return true;
2160 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_3d:
2161 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_4d:
2162 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_5d:
2163 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::DEC),
2164 /*IsIm2Col=*/true);
2165 return true;
2166 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_1d:
2167 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_2d:
2168 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_3d:
2169 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_4d:
2170 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_5d:
2171 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::AND));
2172 return true;
2173 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_3d:
2174 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_4d:
2175 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_5d:
2176 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::AND),
2177 /*IsIm2Col=*/true);
2178 return true;
2179 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_1d:
2180 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_2d:
2181 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_3d:
2182 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_4d:
2183 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_5d:
2184 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::OR));
2185 return true;
2186 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_3d:
2187 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_4d:
2188 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_5d:
2189 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::OR),
2190 /*IsIm2Col=*/true);
2191 return true;
2192 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_1d:
2193 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_2d:
2194 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_3d:
2195 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_4d:
2196 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_5d:
2197 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::XOR));
2198 return true;
2199 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_3d:
2200 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_4d:
2201 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_5d:
2202 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::XOR),
2203 /*IsIm2Col=*/true);
2204 return true;
2205
2206 case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
2207 case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
2208 case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
2209 case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
2210 case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
2211 case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
2212 case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
2213 case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
2214 case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
2215 case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
2216 case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
2217 case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
2218 case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
2219 case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
2220 case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
2221 case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
2222 case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
2223 case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
2224 case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
2225 case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
2226 case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
2227 case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
2228 case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
2229 case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
2230 case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
2231 case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
2232 case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
2233 case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
2234 case Intrinsic::nvvm_tcgen05_st_16x256b_x32: {
2235 SelectTcgen05St(N);
2236 return true;
2237 }
2238
2239 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1:
2240 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2:
2241 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4:
2242 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8:
2243 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16:
2244 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32:
2245 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64:
2246 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128: {
2247 SelectTcgen05St(N, /* hasOffset */ true);
2248 return true;
2249 }
2250 }
2251}
2252
2253void NVPTXDAGToDAGISel::selectAtomicSwap128(SDNode *N) {
2254 MemSDNode *AN = cast<MemSDNode>(Val: N);
2255 SDLoc dl(N);
2256
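  // The node's operands are {Chain, Addr, swap value(s)...}; the machine node
  // takes {Base, Offset, value(s)..., ordering, scope, addrspace, Chain}.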
2257 const SDValue Chain = N->getOperand(Num: 0);
2258 const auto [Base, Offset] = selectADDR(Addr: N->getOperand(Num: 1), DAG: CurDAG);
2259 SmallVector<SDValue, 5> Ops{Base, Offset};
2260 Ops.append(in_start: N->op_begin() + 2, in_end: N->op_end());
2261 Ops.append(IL: {
2262 getI32Imm(Imm: getMemOrder(N: AN), DL: dl),
2263 getI32Imm(Imm: getAtomicScope(N: AN), DL: dl),
2264 getI32Imm(Imm: getAddrSpace(N: AN), DL: dl),
2265 Chain,
2266 });
2267
2268 assert(N->getOpcode() == NVPTXISD::ATOMIC_CMP_SWAP_B128 ||
2269 N->getOpcode() == NVPTXISD::ATOMIC_SWAP_B128);
2270 unsigned Opcode = N->getOpcode() == NVPTXISD::ATOMIC_SWAP_B128
2271 ? NVPTX::ATOM_EXCH_B128
2272 : NVPTX::ATOM_CAS_B128;
2273
2274 auto *ATOM = CurDAG->getMachineNode(Opcode, dl, VTs: N->getVTList(), Ops);
2275 CurDAG->setNodeMemRefs(N: ATOM, NewMemRefs: AN->getMemOperand());
2276
2277 ReplaceNode(F: N, T: ATOM);
2278}
2279
2280void NVPTXDAGToDAGISel::selectBR_JT(SDNode *N) {
2281 assert(Subtarget->hasBrx() &&
2282 "BR_JT should be expanded during legalization on unsupported targets");
2283
2284 SDLoc DL(N);
2285 const SDValue InChain = N->getOperand(Num: 0);
2286 const auto *JT = cast<JumpTableSDNode>(Val: N->getOperand(Num: 1));
2287 const SDValue Index = N->getOperand(Num: 2);
2288
2289 unsigned JId = JT->getIndex();
2290 MachineJumpTableInfo *MJTI = CurDAG->getMachineFunction().getJumpTableInfo();
2291 ArrayRef<MachineBasicBlock *> MBBs = MJTI->getJumpTables()[JId].MBBs;
2292
2293 SDValue IdV = getI32Imm(Imm: JId, DL);
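  // The jump table lowers to a chained BRX_START / BRX_ITEM* / BRX_END
  // sequence, with one BRX_ITEM per non-final target block and the final
  // target folded into BRX_END.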
2294
2295 // Generate BrxStart node
2296 MachineSDNode *Chain = CurDAG->getMachineNode(
2297 Opcode: NVPTX::BRX_START, dl: DL, ResultTys: {MVT::Other, MVT::Glue}, Ops: {IdV, InChain});
2298
2299 // Generate BrxItem nodes
2300 assert(!MBBs.empty());
2301 for (MachineBasicBlock *MBB : MBBs.drop_back())
2302 Chain = CurDAG->getMachineNode(
2303 Opcode: NVPTX::BRX_ITEM, dl: DL, ResultTys: {MVT::Other, MVT::Glue},
2304 Ops: {CurDAG->getBasicBlock(MBB), SDValue(Chain, 0), SDValue(Chain, 1)});
2305
2306  // Generate the BrxEnd node
2307 MachineSDNode *BrxEnd =
2308 CurDAG->getMachineNode(Opcode: NVPTX::BRX_END, dl: DL, VT: MVT::Other,
2309 Ops: {CurDAG->getBasicBlock(MBB: MBBs.back()), Index, IdV,
2310 SDValue(Chain, 0), SDValue(Chain, 1)});
2311
2312 ReplaceNode(F: N, T: BrxEnd);
2313}