//===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines an instruction selector for the NVPTX target.
//
//===----------------------------------------------------------------------===//

#include "NVPTXISelDAGToDAG.h"
#include "NVPTX.h"
#include "NVPTXUtilities.h"
#include "llvm/ADT/APInt.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/NVVMIntrinsicUtils.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/MathExtras.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "nvptx-isel"
#define PASS_NAME "NVPTX DAG->DAG Pattern Instruction Selection"

static cl::opt<bool>
    EnableRsqrtOpt("nvptx-rsqrt-approx-opt", cl::init(true), cl::Hidden,
                   cl::desc("Enable reciprocal sqrt optimization"));
40
41/// createNVPTXISelDag - This pass converts a legalized DAG into a
42/// NVPTX-specific DAG, ready for instruction scheduling.
43FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
44 llvm::CodeGenOptLevel OptLevel) {
45 return new NVPTXDAGToDAGISelLegacy(TM, OptLevel);
46}
47
48NVPTXDAGToDAGISelLegacy::NVPTXDAGToDAGISelLegacy(NVPTXTargetMachine &tm,
49 CodeGenOptLevel OptLevel)
50 : SelectionDAGISelLegacy(
51 ID, std::make_unique<NVPTXDAGToDAGISel>(args&: tm, args&: OptLevel)) {}
52
53char NVPTXDAGToDAGISelLegacy::ID = 0;
54
55INITIALIZE_PASS(NVPTXDAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false)
56
57NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
58 CodeGenOptLevel OptLevel)
59 : SelectionDAGISel(tm, OptLevel), TM(tm) {
60 doMulWide = (OptLevel > CodeGenOptLevel::None);
61}
62
63bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
64 Subtarget = &MF.getSubtarget<NVPTXSubtarget>();
65 Scopes = NVPTXScopes(MF.getFunction().getContext());
66 return SelectionDAGISel::runOnMachineFunction(mf&: MF);
67}
68
69NVPTX::DivPrecisionLevel
70NVPTXDAGToDAGISel::getDivF32Level(const SDNode *N) const {
71 return Subtarget->getTargetLowering()->getDivF32Level(MF: *MF, N: *N);
72}
73
74bool NVPTXDAGToDAGISel::usePrecSqrtF32(const SDNode *N) const {
75 return Subtarget->getTargetLowering()->usePrecSqrtF32(MF: *MF, N);
76}
77
78bool NVPTXDAGToDAGISel::useF32FTZ() const {
79 return Subtarget->getTargetLowering()->useF32FTZ(MF: *MF);
80}
81
82bool NVPTXDAGToDAGISel::allowFMA() const {
83 const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
84 return TL->allowFMA(MF&: *MF, OptLevel);
85}
86
87bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const {
88 const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
89 return TL->allowUnsafeFPMath(MF: *MF);
90}
91
92bool NVPTXDAGToDAGISel::doRsqrtOpt() const { return EnableRsqrtOpt; }
93
94/// Select - Select instructions not customized! Used for
95/// expanded, promoted and normal instructions.
96void NVPTXDAGToDAGISel::Select(SDNode *N) {
97
98 if (N->isMachineOpcode()) {
99 N->setNodeId(-1);
100 return; // Already selected.
101 }
102
103 switch (N->getOpcode()) {
104 case ISD::LOAD:
105 case ISD::ATOMIC_LOAD:
106 if (tryLoad(N))
107 return;
108 break;
109 case ISD::STORE:
110 case ISD::ATOMIC_STORE:
111 if (tryStore(N))
112 return;
113 break;
114 case ISD::ATOMIC_FENCE:
115 if (tryFence(N))
116 return;
117 break;
118 case NVPTXISD::UNPACK_VECTOR:
119 tryUNPACK_VECTOR(N);
120 return;
121 case ISD::EXTRACT_VECTOR_ELT:
122 if (tryEXTRACT_VECTOR_ELEMENT(N))
123 return;
124 break;
125 case NVPTXISD::SETP_F16X2:
126 SelectSETP_F16X2(N);
127 return;
128 case NVPTXISD::SETP_BF16X2:
129 SelectSETP_BF16X2(N);
130 return;
131 case NVPTXISD::LoadV2:
132 case NVPTXISD::LoadV4:
133 case NVPTXISD::LoadV8:
134 if (tryLoadVector(N))
135 return;
136 break;
137 case NVPTXISD::LDUV2:
138 case NVPTXISD::LDUV4:
139 if (tryLDU(N))
140 return;
141 break;
142 case NVPTXISD::StoreV2:
143 case NVPTXISD::StoreV4:
144 case NVPTXISD::StoreV8:
145 if (tryStoreVector(N))
146 return;
147 break;
148 case NVPTXISD::LoadParam:
149 case NVPTXISD::LoadParamV2:
150 case NVPTXISD::LoadParamV4:
151 if (tryLoadParam(N))
152 return;
153 break;
154 case NVPTXISD::StoreParam:
155 case NVPTXISD::StoreParamV2:
156 case NVPTXISD::StoreParamV4:
157 if (tryStoreParam(N))
158 return;
159 break;
160 case ISD::INTRINSIC_W_CHAIN:
161 if (tryIntrinsicChain(N))
162 return;
163 break;
164 case ISD::INTRINSIC_VOID:
165 if (tryIntrinsicVoid(N))
166 return;
167 break;
168 case ISD::AND:
169 case ISD::SRA:
170 case ISD::SRL:
171 // Try to select BFE
172 if (tryBFE(N))
173 return;
174 break;
175 case ISD::ADDRSPACECAST:
176 SelectAddrSpaceCast(N);
177 return;
178 case ISD::CopyToReg: {
179 if (N->getOperand(Num: 1).getValueType() == MVT::i128) {
180 SelectV2I64toI128(N);
181 return;
182 }
183 break;
184 }
185 case ISD::CopyFromReg: {
186 if (N->getOperand(Num: 1).getValueType() == MVT::i128) {
187 SelectI128toV2I64(N);
188 return;
189 }
190 break;
191 }
192 case ISD::FADD:
193 case ISD::FMUL:
194 case ISD::FSUB:
195 if (tryBF16ArithToFMA(N))
196 return;
197 break;
198 default:
199 break;
200 }
201 SelectCode(N);
202}
203
204#define TCGEN05_LD_OPCODE(SHAPE, NUM) \
205 (enablePack ? NVPTX::TCGEN05_LD_##SHAPE##_##NUM##_PACK \
206 : NVPTX::TCGEN05_LD_##SHAPE##_##NUM)
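
// Illustrative expansion of the macro above: TCGEN05_LD_OPCODE(16x64b, x2)
// evaluates to NVPTX::TCGEN05_LD_16x64b_x2_PACK when enablePack is true and
// to NVPTX::TCGEN05_LD_16x64b_x2 otherwise; getTcgen05LdOpcode() below simply
// maps each intrinsic ID onto one such expansion.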
207
208static unsigned getTcgen05LdOpcode(unsigned IID, bool enablePack) {
209 switch (IID) {
210 case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
211 return TCGEN05_LD_OPCODE(16x64b, x1);
212 case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
213 return TCGEN05_LD_OPCODE(16x64b, x2);
214 case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
215 return TCGEN05_LD_OPCODE(16x64b, x4);
216 case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
217 return TCGEN05_LD_OPCODE(16x64b, x8);
218 case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
219 return TCGEN05_LD_OPCODE(16x64b, x16);
220 case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
221 return TCGEN05_LD_OPCODE(16x64b, x32);
222 case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
223 return TCGEN05_LD_OPCODE(16x64b, x64);
224 case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
225 return TCGEN05_LD_OPCODE(16x64b, x128);
226 case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
227 return TCGEN05_LD_OPCODE(16x128b, x1);
228 case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
229 return TCGEN05_LD_OPCODE(16x128b, x2);
230 case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
231 return TCGEN05_LD_OPCODE(16x128b, x4);
232 case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
233 return TCGEN05_LD_OPCODE(16x128b, x8);
234 case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
235 return TCGEN05_LD_OPCODE(16x128b, x16);
236 case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
237 return TCGEN05_LD_OPCODE(16x128b, x32);
238 case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
239 return TCGEN05_LD_OPCODE(16x128b, x64);
240 case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
241 return TCGEN05_LD_OPCODE(16x256b, x1);
242 case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
243 return TCGEN05_LD_OPCODE(16x256b, x2);
244 case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
245 return TCGEN05_LD_OPCODE(16x256b, x4);
246 case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
247 return TCGEN05_LD_OPCODE(16x256b, x8);
248 case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
249 return TCGEN05_LD_OPCODE(16x256b, x16);
250 case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
251 return TCGEN05_LD_OPCODE(16x256b, x32);
252 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1:
253 return TCGEN05_LD_OPCODE(16x32bx2, x1);
254 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
255 return TCGEN05_LD_OPCODE(16x32bx2, x2);
256 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
257 return TCGEN05_LD_OPCODE(16x32bx2, x4);
258 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
259 return TCGEN05_LD_OPCODE(16x32bx2, x8);
260 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
261 return TCGEN05_LD_OPCODE(16x32bx2, x16);
262 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
263 return TCGEN05_LD_OPCODE(16x32bx2, x32);
264 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
265 return TCGEN05_LD_OPCODE(16x32bx2, x64);
266 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128:
267 return TCGEN05_LD_OPCODE(16x32bx2, x128);
268 case Intrinsic::nvvm_tcgen05_ld_32x32b_x1:
269 return TCGEN05_LD_OPCODE(32x32b, x1);
270 case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
271 return TCGEN05_LD_OPCODE(32x32b, x2);
272 case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
273 return TCGEN05_LD_OPCODE(32x32b, x4);
274 case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
275 return TCGEN05_LD_OPCODE(32x32b, x8);
276 case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
277 return TCGEN05_LD_OPCODE(32x32b, x16);
278 case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
279 return TCGEN05_LD_OPCODE(32x32b, x32);
280 case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
281 return TCGEN05_LD_OPCODE(32x32b, x64);
282 case Intrinsic::nvvm_tcgen05_ld_32x32b_x128:
283 return TCGEN05_LD_OPCODE(32x32b, x128);
284 }
285 llvm_unreachable("unhandled tcgen05.ld lowering");
286}
287
288void NVPTXDAGToDAGISel::SelectTcgen05Ld(SDNode *N, bool hasOffset) {
289 SDLoc DL(N);
290 unsigned IID = cast<ConstantSDNode>(Val: N->getOperand(Num: 1))->getZExtValue();
291
292 if (hasOffset) {
293 bool enablePack = cast<ConstantSDNode>(Val: N->getOperand(Num: 4))->getZExtValue();
294 auto OffsetNode = CurDAG->getTargetConstant(
295 Val: cast<ConstantSDNode>(Val: N->getOperand(Num: 3))->getZExtValue(), DL, VT: MVT::i32);
296 ReplaceNode(F: N, T: CurDAG->getMachineNode(
297 Opcode: getTcgen05LdOpcode(IID, enablePack), dl: DL, VTs: N->getVTList(),
298 Ops: {N->getOperand(Num: 2), OffsetNode, N->getOperand(Num: 0)}));
299 } else {
300 bool enablePack = cast<ConstantSDNode>(Val: N->getOperand(Num: 3))->getZExtValue();
301 ReplaceNode(F: N, T: CurDAG->getMachineNode(
302 Opcode: getTcgen05LdOpcode(IID, enablePack), dl: DL, VTs: N->getVTList(),
303 Ops: {N->getOperand(Num: 2), N->getOperand(Num: 0)}));
304 }
305}
306
307bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
308 unsigned IID = N->getConstantOperandVal(Num: 1);
309 switch (IID) {
310 default:
311 return false;
312 case Intrinsic::nvvm_ldu_global_f:
313 case Intrinsic::nvvm_ldu_global_i:
314 case Intrinsic::nvvm_ldu_global_p:
315 return tryLDU(N);
316
317 case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
318 case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
319 case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
320 case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
321 case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
322 case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
323 case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
324 case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
  case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
  case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
338 case Intrinsic::nvvm_tcgen05_ld_32x32b_x1:
339 case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
340 case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
341 case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
342 case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
343 case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
344 case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
345 case Intrinsic::nvvm_tcgen05_ld_32x32b_x128: {
346 SelectTcgen05Ld(N);
347 return true;
348 }
349
350 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1:
351 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
352 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
353 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
354 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
355 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
356 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
357 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128: {
358 SelectTcgen05Ld(N, /* hasOffset */ true);
359 return true;
360 }
361 }
362}
363
// Map an ISD::CondCode value to the appropriate CmpMode expected by
// NVPTXInstPrinter::printCmpMode().
366static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ) {
367 using NVPTX::PTXCmpMode::CmpMode;
368 unsigned PTXCmpMode = [](ISD::CondCode CC) {
369 switch (CC) {
370 default:
371 llvm_unreachable("Unexpected condition code.");
372 case ISD::SETOEQ:
373 return CmpMode::EQ;
374 case ISD::SETOGT:
375 return CmpMode::GT;
376 case ISD::SETOGE:
377 return CmpMode::GE;
378 case ISD::SETOLT:
379 return CmpMode::LT;
380 case ISD::SETOLE:
381 return CmpMode::LE;
382 case ISD::SETONE:
383 return CmpMode::NE;
384 case ISD::SETO:
385 return CmpMode::NUM;
386 case ISD::SETUO:
387 return CmpMode::NotANumber;
388 case ISD::SETUEQ:
389 return CmpMode::EQU;
390 case ISD::SETUGT:
391 return CmpMode::GTU;
392 case ISD::SETUGE:
393 return CmpMode::GEU;
394 case ISD::SETULT:
395 return CmpMode::LTU;
396 case ISD::SETULE:
397 return CmpMode::LEU;
398 case ISD::SETUNE:
399 return CmpMode::NEU;
400 case ISD::SETEQ:
401 return CmpMode::EQ;
402 case ISD::SETGT:
403 return CmpMode::GT;
404 case ISD::SETGE:
405 return CmpMode::GE;
406 case ISD::SETLT:
407 return CmpMode::LT;
408 case ISD::SETLE:
409 return CmpMode::LE;
410 case ISD::SETNE:
411 return CmpMode::NE;
412 }
413 }(CondCode.get());
414
415 if (FTZ)
416 PTXCmpMode |= NVPTX::PTXCmpMode::FTZ_FLAG;
417
418 return PTXCmpMode;
419}
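
// Example (illustrative): for ISD::SETOGT with FTZ enabled, getPTXCmpMode()
// returns CmpMode::GT with FTZ_FLAG OR'ed in, which the instruction printer
// is expected to render as a "gt.ftz" comparison modifier on the emitted
// setp instruction.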
420
421bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
422 unsigned PTXCmpMode =
423 getPTXCmpMode(CondCode: *cast<CondCodeSDNode>(Val: N->getOperand(Num: 2)), FTZ: useF32FTZ());
424 SDLoc DL(N);
425 SDNode *SetP = CurDAG->getMachineNode(
426 Opcode: NVPTX::SETP_f16x2rr, dl: DL, VT1: MVT::i1, VT2: MVT::i1, Op1: N->getOperand(Num: 0),
427 Op2: N->getOperand(Num: 1), Op3: CurDAG->getTargetConstant(Val: PTXCmpMode, DL, VT: MVT::i32));
428 ReplaceNode(F: N, T: SetP);
429 return true;
430}
431
432bool NVPTXDAGToDAGISel::SelectSETP_BF16X2(SDNode *N) {
433 unsigned PTXCmpMode =
434 getPTXCmpMode(CondCode: *cast<CondCodeSDNode>(Val: N->getOperand(Num: 2)), FTZ: useF32FTZ());
435 SDLoc DL(N);
436 SDNode *SetP = CurDAG->getMachineNode(
437 Opcode: NVPTX::SETP_bf16x2rr, dl: DL, VT1: MVT::i1, VT2: MVT::i1, Op1: N->getOperand(Num: 0),
438 Op2: N->getOperand(Num: 1), Op3: CurDAG->getTargetConstant(Val: PTXCmpMode, DL, VT: MVT::i32));
439 ReplaceNode(F: N, T: SetP);
440 return true;
441}
442
443bool NVPTXDAGToDAGISel::tryUNPACK_VECTOR(SDNode *N) {
444 SDValue Vector = N->getOperand(Num: 0);
445 MVT EltVT = N->getSimpleValueType(ResNo: 0);
446
447 MachineSDNode *N2 =
448 CurDAG->getMachineNode(Opcode: NVPTX::I64toV2I32, dl: SDLoc(N), VT1: EltVT, VT2: EltVT, Ops: Vector);
449
450 ReplaceNode(F: N, T: N2);
451 return true;
452}
453
454// Find all instances of extract_vector_elt that use this v2f16 vector
455// and coalesce them into a scattering move instruction.
456bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
457 SDValue Vector = N->getOperand(Num: 0);
458
459 // We only care about 16x2 as it's the only real vector type we
460 // need to deal with.
461 MVT VT = Vector.getSimpleValueType();
462 if (!Isv2x16VT(VT))
463 return false;
464 // Find and record all uses of this vector that extract element 0 or 1.
465 SmallVector<SDNode *, 4> E0, E1;
466 for (auto *U : Vector.getNode()->users()) {
467 if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
468 continue;
469 if (U->getOperand(Num: 0) != Vector)
470 continue;
471 if (const ConstantSDNode *IdxConst =
472 dyn_cast<ConstantSDNode>(Val: U->getOperand(Num: 1))) {
473 if (IdxConst->getZExtValue() == 0)
474 E0.push_back(Elt: U);
475 else if (IdxConst->getZExtValue() == 1)
476 E1.push_back(Elt: U);
477 else
478 llvm_unreachable("Invalid vector index.");
479 }
480 }
481
482 // There's no point scattering f16x2 if we only ever access one
483 // element of it.
484 if (E0.empty() || E1.empty())
485 return false;
486
487 // Merge (f16 extractelt(V, 0), f16 extractelt(V,1))
488 // into f16,f16 SplitF16x2(V)
489 MVT EltVT = VT.getVectorElementType();
490 SDNode *ScatterOp =
491 CurDAG->getMachineNode(Opcode: NVPTX::I32toV2I16, dl: SDLoc(N), VT1: EltVT, VT2: EltVT, Ops: Vector);
492 for (auto *Node : E0)
493 ReplaceUses(F: SDValue(Node, 0), T: SDValue(ScatterOp, 0));
494 for (auto *Node : E1)
495 ReplaceUses(F: SDValue(Node, 0), T: SDValue(ScatterOp, 1));
496
497 return true;
498}
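
// Schematic example of the rewrite above (illustrative DAG notation):
//   v = ...:v2f16
//   a = extract_vector_elt v, 0
//   b = extract_vector_elt v, 1
// becomes a single machine node whose two results replace both extracts:
//   a, b = I32toV2I16 v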
499
500static std::optional<unsigned> convertAS(unsigned AS) {
501 switch (AS) {
502 case llvm::ADDRESS_SPACE_LOCAL:
503 return NVPTX::AddressSpace::Local;
504 case llvm::ADDRESS_SPACE_GLOBAL:
505 return NVPTX::AddressSpace::Global;
506 case llvm::ADDRESS_SPACE_SHARED:
507 return NVPTX::AddressSpace::Shared;
508 case llvm::ADDRESS_SPACE_SHARED_CLUSTER:
509 return NVPTX::AddressSpace::SharedCluster;
510 case llvm::ADDRESS_SPACE_GENERIC:
511 return NVPTX::AddressSpace::Generic;
512 case llvm::ADDRESS_SPACE_PARAM:
513 return NVPTX::AddressSpace::Param;
514 case llvm::ADDRESS_SPACE_CONST:
515 return NVPTX::AddressSpace::Const;
516 default:
517 return std::nullopt;
518 }
519}
520
521static unsigned int getCodeAddrSpace(const MemSDNode *N) {
522 return convertAS(AS: N->getMemOperand()->getAddrSpace())
523 .value_or(u: NVPTX::AddressSpace::Generic);
524}
525
526namespace {
527
528struct OperationOrderings {
529 NVPTX::Ordering InstructionOrdering, FenceOrdering;
530 OperationOrderings(NVPTX::Ordering IO = NVPTX::Ordering::NotAtomic,
531 NVPTX::Ordering FO = NVPTX::Ordering::NotAtomic)
532 : InstructionOrdering(IO), FenceOrdering(FO) {}
533};
534
535static OperationOrderings
536getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) {
537 AtomicOrdering Ordering = N->getSuccessOrdering();
538 auto CodeAddrSpace = getCodeAddrSpace(N);
539
540 bool HasMemoryOrdering = Subtarget->hasMemoryOrdering();
541 bool HasRelaxedMMIO = Subtarget->hasRelaxedMMIO();
542
543 // clang-format off
544
  // Lowering for Load/Store Operations
  // (note: an AcquireRelease load or store is an error).
546 // Note: uses of Relaxed in the Atomic column of this table refer
547 // to LLVM AtomicOrdering::Monotonic.
548 //
549 // | Atomic | Volatile | Statespace | PTX sm_60- | PTX sm_70+ |
550 // |---------|----------|--------------------|------------|------------------------------|
551 // | No | No | All | plain | .weak |
552 // | No | Yes | Generic,Shared, | .volatile | .volatile |
553 // | | | Global [0] | | |
554 // | No | Yes | Local,Const,Param | plain [1] | .weak [1] |
555 // | Unorder | Yes/No | All | == Relaxed | == Relaxed |
556 // | Relaxed | No | Generic,Shared, | .volatile | <atomic sem> |
557 // | | | Global [0] | | |
558 // | Other | No | Generic,Shared, | Error [2] | <atomic sem> |
559 // | | | Global [0] | | |
560 // | Yes | No | Local,Const,Param | plain [1] | .weak [1] |
561 // | Relaxed | Yes | Generic,Shared [0] | .volatile | .volatile |
562 // | Relaxed | Yes | Global [0] | .volatile | .mmio.relaxed.sys (PTX 8.2+) |
563 // | | | | | or .volatile (PTX 8.1-) |
564 // | Relaxed | Yes | Local,Const,Param | plain [1] | .weak [1] |
565 // | Other | Yes | Generic, Shared, | Error [2] | <atomic sem> [3] |
566 // | | | / Global [0] | | |
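  //
  // A few rows of the table above spelled out (illustrative only; the exact
  // PTX spelling comes from the instruction definitions):
  //   * non-atomic, non-volatile load, sm_70+              -> ld.weak / plain ld
  //   * relaxed atomic load, global, non-volatile, sm_70+  -> ld.relaxed.<scope>
  //   * relaxed atomic load, global, non-volatile, sm_60-  -> ld.volatile
  //   * relaxed atomic volatile load, global, PTX 8.2+     -> ld.mmio.relaxed.sys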
567
568 // Lowering of CUDA C++ SequentiallyConsistent Operations and Fences to PTX
569 // by following the ABI proven sound in:
570 // Lustig et al, A Formal Analysis of the NVIDIA PTX Memory Consistency Model, ASPLOS’19.
571 // https://dl.acm.org/doi/pdf/10.1145/3297858.3304043
572 //
573 // | CUDA C++ Atomic Operation or Atomic Fence | PTX Atomic Operation or Fence |
574 // |------------------------------------------------------|-------------------------------|
575 // | cuda::atomic_thread_fence | fence.sc.<scope>; |
576 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | |
577 // |------------------------------------------------------|-------------------------------|
578 // | cuda::atomic_load | fence.sc.<scope>; |
579 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | ld.acquire.<scope>; |
580 // |------------------------------------------------------|-------------------------------|
581 // | cuda::atomic_store | fence.sc.<scope>; |
582 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | st.release.<scope>; |
583 // |------------------------------------------------------|-------------------------------|
584 // | cuda::atomic_fetch_<op> | fence.sc.<scope>; |
585 // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | atom.acq_rel.<scope>; |
586
587 // clang-format on
588
589 // [0]: volatile and atomics are only supported on global or shared
590 // memory locations, accessed via generic/shared/global pointers.
591 // MMIO is only supported on global memory locations,
592 // accessed via generic/global pointers.
593 // TODO: Implement MMIO access via generic pointer to global.
594 // Currently implemented for global pointers only.
595
596 // [1]: Lowering volatile/atomic operations to non-volatile/non-atomic
597 // PTX instructions fails to preserve their C++ side-effects.
598 //
599 // Example (https://github.com/llvm/llvm-project/issues/62057):
600 //
601 // void example() {
602 // std::atomic<bool> True = true;
603 // while (True.load(std::memory_order_relaxed));
604 // }
605 //
606 // A C++ program that calls "example" is well-defined: the infinite loop
607 // performs an atomic operation. By lowering volatile/atomics to
608 // "weak" memory operations, we are transforming the above into:
609 //
610 // void undefined_behavior() {
611 // bool True = true;
612 // while (True);
613 // }
614 //
615 // which exhibits undefined behavior in both C++ and PTX.
616 //
617 // Calling "example" in CUDA C++ compiled for sm_60- exhibits undefined
618 // behavior due to lack of Independent Forward Progress. Lowering these
619 // to weak memory operations in sm_60- is therefore fine.
620 //
621 // TODO: lower atomic and volatile operations to memory locations
622 // in local, const, and param to two PTX instructions in sm_70+:
623 // - the "weak" memory instruction we are currently lowering to, and
624 // - some other instruction that preserves the side-effect, e.g.,
625 // a dead dummy volatile load.
626 if (CodeAddrSpace == NVPTX::AddressSpace::Local ||
627 CodeAddrSpace == NVPTX::AddressSpace::Const ||
628 CodeAddrSpace == NVPTX::AddressSpace::Param) {
629 return NVPTX::Ordering::NotAtomic;
630 }
631
632 // [2]: Atomics with Ordering different than Unordered or Relaxed are not
633 // supported on sm_60 and older; this includes volatile atomics.
634 if (!(Ordering == AtomicOrdering::NotAtomic ||
635 Ordering == AtomicOrdering::Unordered ||
636 Ordering == AtomicOrdering::Monotonic) &&
637 !HasMemoryOrdering) {
    report_fatal_error(
        formatv("PTX does not support \"atomic\" for orderings different than "
                "\"NotAtomic\" or \"Monotonic\" for sm_60 or older, but order "
                "is: \"{}\".",
                toIRString(Ordering)));
643 }
644
645 // [3]: TODO: these should eventually use .mmio<.atomic sem>; for now we drop
646 // the volatile semantics and preserve the atomic ones.
647
648 // PTX volatile and PTX atomics are not available for statespace that differ
649 // from .generic, .global, or .shared. The behavior of PTX volatile and PTX
650 // atomics is undefined if the generic address does not refer to a .global or
651 // .shared memory location.
652 bool AddrGenericOrGlobalOrShared =
653 (CodeAddrSpace == NVPTX::AddressSpace::Generic ||
654 CodeAddrSpace == NVPTX::AddressSpace::Global ||
655 CodeAddrSpace == NVPTX::AddressSpace::Shared ||
656 CodeAddrSpace == NVPTX::AddressSpace::SharedCluster);
657 if (!AddrGenericOrGlobalOrShared)
658 return NVPTX::Ordering::NotAtomic;
659
660 bool UseRelaxedMMIO =
661 HasRelaxedMMIO && CodeAddrSpace == NVPTX::AddressSpace::Global;
662
663 switch (Ordering) {
664 case AtomicOrdering::NotAtomic:
665 return N->isVolatile() ? NVPTX::Ordering::Volatile
666 : NVPTX::Ordering::NotAtomic;
667 case AtomicOrdering::Unordered:
668 // We lower unordered in the exact same way as 'monotonic' to respect
669 // LLVM IR atomicity requirements.
670 case AtomicOrdering::Monotonic:
671 if (N->isVolatile())
672 return UseRelaxedMMIO ? NVPTX::Ordering::RelaxedMMIO
673 : NVPTX::Ordering::Volatile;
674 else
675 return HasMemoryOrdering ? NVPTX::Ordering::Relaxed
676 : NVPTX::Ordering::Volatile;
677 // case AtomicOrdering::Consume: // If LLVM ever provides this, lower it to
678 // Acquire.
679 case AtomicOrdering::Acquire:
680 if (!N->readMem())
681 report_fatal_error(
682 reason: formatv(Fmt: "PTX only supports Acquire Ordering on reads: {}",
683 Vals: N->getOperationName()));
684 return NVPTX::Ordering::Acquire;
685 case AtomicOrdering::Release:
686 if (!N->writeMem())
687 report_fatal_error(
688 reason: formatv(Fmt: "PTX only supports Release Ordering on writes: {}",
689 Vals: N->getOperationName()));
690 return NVPTX::Ordering::Release;
691 case AtomicOrdering::AcquireRelease: {
692 report_fatal_error(
693 reason: formatv(Fmt: "NVPTX does not support AcquireRelease Ordering on "
694 "read-modify-write "
695 "yet and PTX does not support it on loads or stores: {}",
696 Vals: N->getOperationName()));
697 }
698 case AtomicOrdering::SequentiallyConsistent: {
699 // LLVM-IR SequentiallyConsistent atomics map to a two-instruction PTX
700 // sequence including a "fence.sc.sco" and the memory instruction with an
701 // Ordering that differs from "sc": acq, rel, or acq_rel, depending on
702 // whether the memory operation is a read, write, or read-modify-write.
703 //
704 // This sets the ordering of the fence to SequentiallyConsistent, and
705 // sets the corresponding ordering for the instruction.
706 NVPTX::Ordering InstrOrder;
707 if (N->readMem())
708 InstrOrder = NVPTX::Ordering::Acquire;
709 else if (N->writeMem())
710 InstrOrder = NVPTX::Ordering::Release;
711 else
712 report_fatal_error(
713 reason: formatv(Fmt: "NVPTX does not support SequentiallyConsistent Ordering on "
714 "read-modify-writes yet: {}",
715 Vals: N->getOperationName()));
716 return OperationOrderings(InstrOrder,
717 NVPTX::Ordering::SequentiallyConsistent);
718 }
719 }
720 report_fatal_error(
721 reason: formatv(Fmt: "NVPTX backend does not support AtomicOrdering \"{}\" yet.",
722 Vals: toIRString(ao: Ordering)));
723}
724
725} // namespace
726
727NVPTX::Scope NVPTXDAGToDAGISel::getOperationScope(MemSDNode *N,
728 NVPTX::Ordering O) const {
729 switch (O) {
730 case NVPTX::Ordering::NotAtomic:
731 case NVPTX::Ordering::Volatile: // Non-atomic volatile operations
732 // NVPTX uses Thread scope as the scope of non-atomic operations.
733 return NVPTX::Scope::Thread;
734 case NVPTX::Ordering::RelaxedMMIO:
735 // RelaxedMMIO operations are always system scope.
736 // If a RelaxedMMIO order was generated from an atomic volatile operation
737 // with a smaller thread scope, we bump it here to system scope.
738 return NVPTX::Scope::System;
739 case NVPTX::Ordering::Relaxed:
740 case NVPTX::Ordering::Acquire:
741 case NVPTX::Ordering::Release:
742 case NVPTX::Ordering::AcquireRelease:
743 case NVPTX::Ordering::SequentiallyConsistent:
744 auto S = Scopes[N->getSyncScopeID()];
745
746 // Atomic operations must have a scope greater than thread.
747 if (S == NVPTX::Scope::Thread)
748 report_fatal_error(
749 reason: formatv(Fmt: "Atomics need scope > \"{}\".", Vals: ScopeToString(S)));
750
751 // If scope is cluster, clusters must be supported.
752 if (S == NVPTX::Scope::Cluster)
753 Subtarget->failIfClustersUnsupported(FailureMessage: "cluster scope");
754
755 // If operation is volatile, then its scope is system.
756 return N->isVolatile() ? NVPTX::Scope::System : S;
757 }
758 llvm_unreachable("unhandled ordering");
759}
760
761static bool canLowerToLDG(const MemSDNode &N, const NVPTXSubtarget &Subtarget,
762 unsigned CodeAddrSpace) {
763 // We use ldg (i.e. ld.global.nc) for invariant loads from the global address
764 // space.
765 return Subtarget.hasLDG() && CodeAddrSpace == NVPTX::AddressSpace::Global &&
766 N.isInvariant();
767}
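
// Illustrative IR that takes this path (assumed, minimal):
//   %v = load i32, ptr addrspace(1) %p, !invariant.load !0
// On subtargets with LDG support this selects the non-coherent form, roughly
// "ld.global.nc.b32", instead of a plain "ld.global.b32".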
768
769static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S,
770 NVPTXSubtarget const *T) {
771 if (S == NVPTX::Scope::Cluster)
772 T->failIfClustersUnsupported(FailureMessage: ".cluster scope fence");
773
774 // Fall back to .acq_rel if .acquire, .release is not supported.
775 if (!T->hasSplitAcquireAndReleaseFences() &&
776 (O == NVPTX::Ordering::Acquire || O == NVPTX::Ordering::Release))
777 O = NVPTX::Ordering::AcquireRelease;
778
779 switch (O) {
780 case NVPTX::Ordering::Acquire:
781 switch (S) {
782 case NVPTX::Scope::System:
783 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_sys
784 : NVPTX::INT_MEMBAR_SYS;
785 case NVPTX::Scope::Block:
786 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_cta
787 : NVPTX::INT_MEMBAR_CTA;
788 case NVPTX::Scope::Cluster:
789 return NVPTX::atomic_thread_fence_acquire_cluster;
790 case NVPTX::Scope::Device:
791 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_gpu
792 : NVPTX::INT_MEMBAR_GL;
793 case NVPTX::Scope::Thread:
794 report_fatal_error(
795 reason: formatv(Fmt: "Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
796 Vals: ScopeToString(S)));
797 }
798 break;
799 case NVPTX::Ordering::Release:
800 switch (S) {
801 case NVPTX::Scope::System:
802 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_sys
803 : NVPTX::INT_MEMBAR_SYS;
804 case NVPTX::Scope::Block:
805 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_cta
806 : NVPTX::INT_MEMBAR_CTA;
807 case NVPTX::Scope::Cluster:
808 return NVPTX::atomic_thread_fence_release_cluster;
809 case NVPTX::Scope::Device:
810 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_gpu
811 : NVPTX::INT_MEMBAR_GL;
812 case NVPTX::Scope::Thread:
813 report_fatal_error(
814 reason: formatv(Fmt: "Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
815 Vals: ScopeToString(S)));
816 }
817 break;
818 case NVPTX::Ordering::AcquireRelease: {
819 switch (S) {
820 case NVPTX::Scope::System:
821 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_sys
822 : NVPTX::INT_MEMBAR_SYS;
823 case NVPTX::Scope::Block:
824 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_cta
825 : NVPTX::INT_MEMBAR_CTA;
826 case NVPTX::Scope::Cluster:
827 return NVPTX::atomic_thread_fence_acq_rel_cluster;
828 case NVPTX::Scope::Device:
829 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_gpu
830 : NVPTX::INT_MEMBAR_GL;
831 case NVPTX::Scope::Thread:
832 report_fatal_error(
833 reason: formatv(Fmt: "Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
834 Vals: ScopeToString(S)));
835 }
836 break;
837 }
838 case NVPTX::Ordering::SequentiallyConsistent: {
839 switch (S) {
840 case NVPTX::Scope::System:
841 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_sys
842 : NVPTX::INT_MEMBAR_SYS;
843 case NVPTX::Scope::Block:
844 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_cta
845 : NVPTX::INT_MEMBAR_CTA;
846 case NVPTX::Scope::Cluster:
847 return NVPTX::atomic_thread_fence_seq_cst_cluster;
848 case NVPTX::Scope::Device:
849 return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_gpu
850 : NVPTX::INT_MEMBAR_GL;
851 case NVPTX::Scope::Thread:
852 report_fatal_error(reason: formatv(Fmt: "Unsupported scope \"{}\" for seq_cst fence.",
853 Vals: ScopeToString(S)));
854 }
855 break;
856 }
857 case NVPTX::Ordering::NotAtomic:
858 case NVPTX::Ordering::Relaxed:
859 case NVPTX::Ordering::Volatile:
860 case NVPTX::Ordering::RelaxedMMIO:
861 report_fatal_error(
862 reason: formatv(Fmt: "Unsupported \"{}\" ordering and \"{}\" scope for fence.",
863 Vals: OrderingToString(Order: O), Vals: ScopeToString(S)));
864 }
865 llvm_unreachable("unhandled ordering");
866}
867
868// Returns Memory Order and Scope of a memory instruction, and
869// inserts any fence before the instruction that's required to
870// implement its memory ordering.
871std::pair<NVPTX::Ordering, NVPTX::Scope>
872NVPTXDAGToDAGISel::insertMemoryInstructionFence(SDLoc DL, SDValue &Chain,
873 MemSDNode *N) {
874 auto [InstructionOrdering, FenceOrdering] =
875 getOperationOrderings(N, Subtarget);
876 auto Scope = getOperationScope(N, O: InstructionOrdering);
877
878 // If a fence is required before the operation, insert it:
879 switch (NVPTX::Ordering(FenceOrdering)) {
880 case NVPTX::Ordering::NotAtomic:
881 break;
882 case NVPTX::Ordering::SequentiallyConsistent: {
883 auto Op = getFenceOp(O: FenceOrdering, S: Scope, T: Subtarget);
884 Chain = SDValue(CurDAG->getMachineNode(Opcode: Op, dl: DL, VT: MVT::Other, Op1: Chain), 0);
885 break;
886 }
887 default:
888 report_fatal_error(
889 reason: formatv(Fmt: "Unexpected fence ordering: \"{}\".",
890 Vals: OrderingToString(Order: NVPTX::Ordering(FenceOrdering))));
891 }
892 return {InstructionOrdering, Scope};
893}
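
// Example (illustrative): an LLVM IR "load atomic ... seq_cst" on sm_70+ is
// expected to lower to a two-instruction PTX sequence, roughly
//   fence.sc.sys;           // emitted here (FenceOrdering = SequentiallyConsistent)
//   ld.acquire.sys.b32 ...; // the load itself (InstructionOrdering = Acquire)
// with the scope taken from the operation's sync scope.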
894
895void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
896 SDValue Src = N->getOperand(Num: 0);
897 AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(Val: N);
898 unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
899 unsigned DstAddrSpace = CastN->getDestAddressSpace();
900 SDLoc DL(N);
901 assert(SrcAddrSpace != DstAddrSpace &&
902 "addrspacecast must be between different address spaces");
903
904 if (DstAddrSpace == ADDRESS_SPACE_GENERIC) {
905 // Specific to generic
906
907 if (TM.is64Bit() && TM.getPointerSizeInBits(AS: SrcAddrSpace) == 32) {
908 SDValue CvtNone =
909 CurDAG->getTargetConstant(Val: NVPTX::PTXCvtMode::NONE, DL, VT: MVT::i32);
910 SDNode *Cvt = CurDAG->getMachineNode(Opcode: NVPTX::CVT_u64_u32, dl: DL, VT: MVT::i64,
911 Op1: Src, Op2: CvtNone);
912 Src = SDValue(Cvt, 0);
913 }
914
915 unsigned Opc;
916 switch (SrcAddrSpace) {
917 default: report_fatal_error(reason: "Bad address space in addrspacecast");
918 case ADDRESS_SPACE_GLOBAL:
919 Opc = TM.is64Bit() ? NVPTX::cvta_global_64 : NVPTX::cvta_global;
920 break;
921 case ADDRESS_SPACE_SHARED:
922 Opc = TM.is64Bit() ? NVPTX::cvta_shared_64 : NVPTX::cvta_shared;
923 break;
924 case ADDRESS_SPACE_SHARED_CLUSTER:
925 if (!TM.is64Bit())
926 report_fatal_error(
927 reason: "Shared cluster address space is only supported in 64-bit mode");
928 Opc = NVPTX::cvta_shared_cluster_64;
929 break;
930 case ADDRESS_SPACE_CONST:
931 Opc = TM.is64Bit() ? NVPTX::cvta_const_64 : NVPTX::cvta_const;
932 break;
933 case ADDRESS_SPACE_LOCAL:
934 Opc = TM.is64Bit() ? NVPTX::cvta_local_64 : NVPTX::cvta_local;
935 break;
936 case ADDRESS_SPACE_PARAM:
937 Opc = TM.is64Bit() ? NVPTX::cvta_param_64 : NVPTX::cvta_param;
938 break;
939 }
940 ReplaceNode(F: N, T: CurDAG->getMachineNode(Opcode: Opc, dl: DL, VT: N->getValueType(ResNo: 0), Op1: Src));
941 return;
942 } else {
943 // Generic to specific
944 if (SrcAddrSpace != 0)
945 report_fatal_error(reason: "Cannot cast between two non-generic address spaces");
946 unsigned Opc;
947 switch (DstAddrSpace) {
948 default: report_fatal_error(reason: "Bad address space in addrspacecast");
949 case ADDRESS_SPACE_GLOBAL:
950 Opc = TM.is64Bit() ? NVPTX::cvta_to_global_64 : NVPTX::cvta_to_global;
951 break;
952 case ADDRESS_SPACE_SHARED:
953 Opc = TM.is64Bit() ? NVPTX::cvta_to_shared_64 : NVPTX::cvta_to_shared;
954 break;
955 case ADDRESS_SPACE_SHARED_CLUSTER:
956 if (!TM.is64Bit())
957 report_fatal_error(
958 reason: "Shared cluster address space is only supported in 64-bit mode");
959 Opc = NVPTX::cvta_to_shared_cluster_64;
960 break;
961 case ADDRESS_SPACE_CONST:
962 Opc = TM.is64Bit() ? NVPTX::cvta_to_const_64 : NVPTX::cvta_to_const;
963 break;
964 case ADDRESS_SPACE_LOCAL:
965 Opc = TM.is64Bit() ? NVPTX::cvta_to_local_64 : NVPTX::cvta_to_local;
966 break;
967 case ADDRESS_SPACE_PARAM:
968 Opc = TM.is64Bit() ? NVPTX::cvta_to_param_64 : NVPTX::cvta_to_param;
969 break;
970 }
971
972 SDNode *CVTA = CurDAG->getMachineNode(Opcode: Opc, dl: DL, VT: N->getValueType(ResNo: 0), Op1: Src);
973 if (TM.is64Bit() && TM.getPointerSizeInBits(AS: DstAddrSpace) == 32) {
974 SDValue CvtNone =
975 CurDAG->getTargetConstant(Val: NVPTX::PTXCvtMode::NONE, DL, VT: MVT::i32);
976 CVTA = CurDAG->getMachineNode(Opcode: NVPTX::CVT_u32_u64, dl: DL, VT: MVT::i32,
977 Op1: SDValue(CVTA, 0), Op2: CvtNone);
978 }
979
980 ReplaceNode(F: N, T: CVTA);
981 return;
982 }
983}
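
// Illustrative selections (64-bit mode; PTX spelling is approximate):
//   addrspacecast ptr addrspace(1) -> ptr   selects cvta_global_64,
//     i.e. roughly "cvta.global.u64".
//   addrspacecast ptr -> ptr addrspace(3)   selects cvta_to_shared_64,
//     i.e. roughly "cvta.to.shared.u64".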
984
// Helper function to reduce the amount of boilerplate code needed for opcode
// selection.
987static std::optional<unsigned>
988pickOpcodeForVT(MVT::SimpleValueType VT, std::optional<unsigned> Opcode_i8,
989 std::optional<unsigned> Opcode_i16,
990 std::optional<unsigned> Opcode_i32,
991 std::optional<unsigned> Opcode_i64) {
992 switch (VT) {
993 case MVT::i1:
994 case MVT::i8:
995 return Opcode_i8;
996 case MVT::f16:
997 case MVT::i16:
998 case MVT::bf16:
999 return Opcode_i16;
1000 case MVT::v2f16:
1001 case MVT::v2bf16:
1002 case MVT::v2i16:
1003 case MVT::v4i8:
1004 case MVT::i32:
1005 case MVT::f32:
1006 return Opcode_i32;
1007 case MVT::i64:
1008 case MVT::f64:
1009 return Opcode_i64;
1010 default:
1011 return std::nullopt;
1012 }
1013}
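
// Note that the packed 16-bit vectors (v2f16, v2bf16, v2i16) and v4i8 share
// the 32-bit opcode on purpose: they are moved as a single 32-bit value. For
// example, pickOpcodeForVT(MVT::v2f16, LD_i8, LD_i16, LD_i32, LD_i64) yields
// the LD_i32 opcode.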
1014
1015bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
1016 MemSDNode *LD = cast<MemSDNode>(Val: N);
1017 assert(LD->readMem() && "Expected load");
1018
1019 // do not support pre/post inc/dec
1020 const LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(Val: LD);
1021 if (PlainLoad && PlainLoad->isIndexed())
1022 return false;
1023
1024 const EVT LoadedEVT = LD->getMemoryVT();
1025 if (!LoadedEVT.isSimple())
1026 return false;
1027 const MVT LoadedVT = LoadedEVT.getSimpleVT();
1028
1029 // Address Space Setting
1030 const unsigned CodeAddrSpace = getCodeAddrSpace(N: LD);
1031 if (canLowerToLDG(N: *LD, Subtarget: *Subtarget, CodeAddrSpace))
1032 return tryLDG(N: LD);
1033
1034 SDLoc DL(LD);
1035 SDValue Chain = N->getOperand(Num: 0);
1036 const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, N: LD);
1037
1038 const unsigned FromTypeWidth = LoadedVT.getSizeInBits();
1039
1040 // Vector Setting
1041 const unsigned FromType =
1042 (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
1043 ? NVPTX::PTXLdStInstCode::Signed
1044 : NVPTX::PTXLdStInstCode::Untyped;
1045
1046 assert(isPowerOf2_32(FromTypeWidth) && FromTypeWidth >= 8 &&
1047 FromTypeWidth <= 128 && "Invalid width for load");
1048
1049 // Create the machine instruction DAG
1050 SDValue Offset, Base;
1051 SelectADDR(Addr: N->getOperand(Num: 1), Base, Offset);
1052 SDValue Ops[] = {getI32Imm(Imm: Ordering, DL),
1053 getI32Imm(Imm: Scope, DL),
1054 getI32Imm(Imm: CodeAddrSpace, DL),
1055 getI32Imm(Imm: FromType, DL),
1056 getI32Imm(Imm: FromTypeWidth, DL),
1057 Base,
1058 Offset,
1059 Chain};
1060
1061 const MVT::SimpleValueType TargetVT = LD->getSimpleValueType(ResNo: 0).SimpleTy;
1062 const std::optional<unsigned> Opcode = pickOpcodeForVT(
1063 VT: TargetVT, Opcode_i8: NVPTX::LD_i8, Opcode_i16: NVPTX::LD_i16, Opcode_i32: NVPTX::LD_i32, Opcode_i64: NVPTX::LD_i64);
1064 if (!Opcode)
1065 return false;
1066
1067 SDNode *NVPTXLD = CurDAG->getMachineNode(Opcode: *Opcode, dl: DL, VTs: LD->getVTList(), Ops);
1068 if (!NVPTXLD)
1069 return false;
1070
1071 MachineMemOperand *MemRef = LD->getMemOperand();
1072 CurDAG->setNodeMemRefs(N: cast<MachineSDNode>(Val: NVPTXLD), NewMemRefs: {MemRef});
1073
1074 ReplaceNode(F: LD, T: NVPTXLD);
1075 return true;
1076}
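
// For example, a simple non-atomic i32 load from the generic address space is
// expected to select to NVPTX::LD_i32 with operands
//   {NotAtomic, Thread, Generic, Untyped, /*width*/32, Base, Offset, Chain},
// matching the order of Ops[] built above.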
1077
1078static unsigned getLoadStoreVectorNumElts(SDNode *N) {
1079 switch (N->getOpcode()) {
1080 case NVPTXISD::LoadV2:
1081 case NVPTXISD::StoreV2:
1082 return 2;
1083 case NVPTXISD::LoadV4:
1084 case NVPTXISD::StoreV4:
1085 return 4;
1086 case NVPTXISD::LoadV8:
1087 case NVPTXISD::StoreV8:
1088 return 8;
1089 default:
1090 llvm_unreachable("Unexpected opcode");
1091 }
1092}
1093
1094bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
1095 MemSDNode *LD = cast<MemSDNode>(Val: N);
1096 const EVT MemEVT = LD->getMemoryVT();
1097 if (!MemEVT.isSimple())
1098 return false;
1099 const MVT MemVT = MemEVT.getSimpleVT();
1100
1101 // Address Space Setting
1102 const unsigned CodeAddrSpace = getCodeAddrSpace(N: LD);
1103 if (canLowerToLDG(N: *LD, Subtarget: *Subtarget, CodeAddrSpace))
1104 return tryLDG(N: LD);
1105
1106 const MVT EltVT = LD->getSimpleValueType(ResNo: 0);
1107 SDLoc DL(LD);
1108 SDValue Chain = LD->getChain();
1109 const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, N: LD);
1110
  // Type Setting: fromType + fromTypeWidth
  //
  // Sign     : ISD::SEXTLOAD
  // Unsigned : ISD::ZEXTLOAD, ISD::NON_EXTLOAD, or ISD::EXTLOAD when the
  //            type is an integer
  // Float    : ISD::NON_EXTLOAD or ISD::EXTLOAD when the type is float
  // Read at least 8 bits (predicates are stored as 8-bit values).
  // The last operand holds the original LoadSDNode::getExtensionType() value.
1119 const unsigned TotalWidth = MemVT.getSizeInBits();
1120 const unsigned ExtensionType =
1121 N->getConstantOperandVal(Num: N->getNumOperands() - 1);
1122 const unsigned FromType = (ExtensionType == ISD::SEXTLOAD)
1123 ? NVPTX::PTXLdStInstCode::Signed
1124 : NVPTX::PTXLdStInstCode::Untyped;
1125
1126 const unsigned FromTypeWidth = TotalWidth / getLoadStoreVectorNumElts(N);
1127
1128 assert(!(EltVT.isVector() && ExtensionType != ISD::NON_EXTLOAD));
1129 assert(isPowerOf2_32(FromTypeWidth) && FromTypeWidth >= 8 &&
1130 FromTypeWidth <= 128 && TotalWidth <= 256 && "Invalid width for load");
1131
1132 SDValue Offset, Base;
1133 SelectADDR(Addr: N->getOperand(Num: 1), Base, Offset);
1134 SDValue Ops[] = {getI32Imm(Imm: Ordering, DL),
1135 getI32Imm(Imm: Scope, DL),
1136 getI32Imm(Imm: CodeAddrSpace, DL),
1137 getI32Imm(Imm: FromType, DL),
1138 getI32Imm(Imm: FromTypeWidth, DL),
1139 Base,
1140 Offset,
1141 Chain};
1142
1143 std::optional<unsigned> Opcode;
1144 switch (N->getOpcode()) {
1145 default:
1146 llvm_unreachable("Unexpected opcode");
1147 case NVPTXISD::LoadV2:
1148 Opcode =
1149 pickOpcodeForVT(VT: EltVT.SimpleTy, Opcode_i8: NVPTX::LDV_i8_v2, Opcode_i16: NVPTX::LDV_i16_v2,
1150 Opcode_i32: NVPTX::LDV_i32_v2, Opcode_i64: NVPTX::LDV_i64_v2);
1151 break;
1152 case NVPTXISD::LoadV4:
1153 Opcode =
1154 pickOpcodeForVT(VT: EltVT.SimpleTy, Opcode_i8: NVPTX::LDV_i8_v4, Opcode_i16: NVPTX::LDV_i16_v4,
1155 Opcode_i32: NVPTX::LDV_i32_v4, Opcode_i64: NVPTX::LDV_i64_v4);
1156 break;
1157 case NVPTXISD::LoadV8:
1158 Opcode = pickOpcodeForVT(VT: EltVT.SimpleTy, Opcode_i8: {/* no v8i8 */}, Opcode_i16: {/* no v8i16 */},
1159 Opcode_i32: NVPTX::LDV_i32_v8, Opcode_i64: {/* no v8i64 */});
1160 break;
1161 }
1162 if (!Opcode)
1163 return false;
1164
1165 SDNode *NVPTXLD = CurDAG->getMachineNode(Opcode: *Opcode, dl: DL, VTs: LD->getVTList(), Ops);
1166
1167 MachineMemOperand *MemRef = LD->getMemOperand();
1168 CurDAG->setNodeMemRefs(N: cast<MachineSDNode>(Val: NVPTXLD), NewMemRefs: {MemRef});
1169
1170 ReplaceNode(F: LD, T: NVPTXLD);
1171 return true;
1172}
1173
1174bool NVPTXDAGToDAGISel::tryLDG(MemSDNode *LD) {
1175 const EVT LoadedEVT = LD->getMemoryVT();
1176 if (!LoadedEVT.isSimple())
1177 return false;
1178 const MVT LoadedVT = LoadedEVT.getSimpleVT();
1179
1180 SDLoc DL(LD);
1181
1182 const unsigned TotalWidth = LoadedVT.getSizeInBits();
1183 unsigned ExtensionType;
1184 unsigned NumElts;
1185 if (const auto *Load = dyn_cast<LoadSDNode>(Val: LD)) {
1186 ExtensionType = Load->getExtensionType();
1187 NumElts = 1;
1188 } else {
1189 ExtensionType = LD->getConstantOperandVal(Num: LD->getNumOperands() - 1);
1190 NumElts = getLoadStoreVectorNumElts(N: LD);
1191 }
1192 const unsigned FromType = (ExtensionType == ISD::SEXTLOAD)
1193 ? NVPTX::PTXLdStInstCode::Signed
1194 : NVPTX::PTXLdStInstCode::Untyped;
1195
1196 const unsigned FromTypeWidth = TotalWidth / NumElts;
1197
1198 assert(!(LD->getSimpleValueType(0).isVector() &&
1199 ExtensionType != ISD::NON_EXTLOAD));
1200 assert(isPowerOf2_32(FromTypeWidth) && FromTypeWidth >= 8 &&
1201 FromTypeWidth <= 128 && TotalWidth <= 256 && "Invalid width for load");
1202
1203 SDValue Base, Offset;
1204 SelectADDR(Addr: LD->getOperand(Num: 1), Base, Offset);
1205 SDValue Ops[] = {getI32Imm(Imm: FromType, DL), getI32Imm(Imm: FromTypeWidth, DL), Base,
1206 Offset, LD->getChain()};
1207
1208 const MVT::SimpleValueType TargetVT = LD->getSimpleValueType(ResNo: 0).SimpleTy;
1209 std::optional<unsigned> Opcode;
1210 switch (LD->getOpcode()) {
1211 default:
1212 llvm_unreachable("Unexpected opcode");
1213 case ISD::LOAD:
1214 Opcode = pickOpcodeForVT(VT: TargetVT, Opcode_i8: NVPTX::LD_GLOBAL_NC_i8,
1215 Opcode_i16: NVPTX::LD_GLOBAL_NC_i16, Opcode_i32: NVPTX::LD_GLOBAL_NC_i32,
1216 Opcode_i64: NVPTX::LD_GLOBAL_NC_i64);
1217 break;
1218 case NVPTXISD::LoadV2:
1219 Opcode = pickOpcodeForVT(
1220 VT: TargetVT, Opcode_i8: NVPTX::LD_GLOBAL_NC_v2i8, Opcode_i16: NVPTX::LD_GLOBAL_NC_v2i16,
1221 Opcode_i32: NVPTX::LD_GLOBAL_NC_v2i32, Opcode_i64: NVPTX::LD_GLOBAL_NC_v2i64);
1222 break;
1223 case NVPTXISD::LoadV4:
1224 Opcode = pickOpcodeForVT(
1225 VT: TargetVT, Opcode_i8: NVPTX::LD_GLOBAL_NC_v4i8, Opcode_i16: NVPTX::LD_GLOBAL_NC_v4i16,
1226 Opcode_i32: NVPTX::LD_GLOBAL_NC_v4i32, Opcode_i64: NVPTX::LD_GLOBAL_NC_v4i64);
1227 break;
1228 case NVPTXISD::LoadV8:
1229 Opcode = pickOpcodeForVT(VT: TargetVT, Opcode_i8: {/* no v8i8 */}, Opcode_i16: {/* no v8i16 */},
1230 Opcode_i32: NVPTX::LD_GLOBAL_NC_v8i32, Opcode_i64: {/* no v8i64 */});
1231 break;
1232 }
1233 if (!Opcode)
1234 return false;
1235
1236 SDNode *NVPTXLDG = CurDAG->getMachineNode(Opcode: *Opcode, dl: DL, VTs: LD->getVTList(), Ops);
1237
1238 ReplaceNode(F: LD, T: NVPTXLDG);
1239 return true;
1240}
1241
1242bool NVPTXDAGToDAGISel::tryLDU(SDNode *N) {
1243 auto *LD = cast<MemSDNode>(Val: N);
1244
1245 unsigned NumElts;
1246 switch (N->getOpcode()) {
1247 default:
1248 llvm_unreachable("Unexpected opcode");
1249 case ISD::INTRINSIC_W_CHAIN:
1250 NumElts = 1;
1251 break;
1252 case NVPTXISD::LDUV2:
1253 NumElts = 2;
1254 break;
1255 case NVPTXISD::LDUV4:
1256 NumElts = 4;
1257 break;
1258 }
1259
1260 const MVT::SimpleValueType SelectVT =
1261 MVT::getIntegerVT(BitWidth: LD->getMemoryVT().getSizeInBits() / NumElts).SimpleTy;
1262
  // If this is an LDU intrinsic, the address is the third operand. If it's an
  // LDU SD node (from custom vector handling), then it's the second operand.
1265 SDValue Addr =
1266 LD->getOperand(Num: LD->getOpcode() == ISD::INTRINSIC_W_CHAIN ? 2 : 1);
1267
1268 SDValue Base, Offset;
1269 SelectADDR(Addr, Base, Offset);
1270 SDValue Ops[] = {Base, Offset, LD->getChain()};
1271
1272 std::optional<unsigned> Opcode;
1273 switch (N->getOpcode()) {
1274 default:
1275 llvm_unreachable("Unexpected opcode");
1276 case ISD::INTRINSIC_W_CHAIN:
1277 Opcode =
1278 pickOpcodeForVT(VT: SelectVT, Opcode_i8: NVPTX::LDU_GLOBAL_i8, Opcode_i16: NVPTX::LDU_GLOBAL_i16,
1279 Opcode_i32: NVPTX::LDU_GLOBAL_i32, Opcode_i64: NVPTX::LDU_GLOBAL_i64);
1280 break;
1281 case NVPTXISD::LDUV2:
1282 Opcode = pickOpcodeForVT(VT: SelectVT, Opcode_i8: NVPTX::LDU_GLOBAL_v2i8,
1283 Opcode_i16: NVPTX::LDU_GLOBAL_v2i16, Opcode_i32: NVPTX::LDU_GLOBAL_v2i32,
1284 Opcode_i64: NVPTX::LDU_GLOBAL_v2i64);
1285 break;
1286 case NVPTXISD::LDUV4:
1287 Opcode = pickOpcodeForVT(VT: SelectVT, Opcode_i8: NVPTX::LDU_GLOBAL_v4i8,
1288 Opcode_i16: NVPTX::LDU_GLOBAL_v4i16, Opcode_i32: NVPTX::LDU_GLOBAL_v4i32,
1289 Opcode_i64: {/* no v4i64 */});
1290 break;
1291 }
1292 if (!Opcode)
1293 return false;
1294
1295 SDLoc DL(N);
1296 SDNode *NVPTXLDU = CurDAG->getMachineNode(Opcode: *Opcode, dl: DL, VTs: LD->getVTList(), Ops);
1297
1298 ReplaceNode(F: LD, T: NVPTXLDU);
1299 return true;
1300}
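
// The LDU path selects the "ldu.global" family (e.g. roughly "ldu.global.b32"
// for a 32-bit element), which PTX defines for loads whose address is the
// same across all threads in the warp.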
1301
1302bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
1303 MemSDNode *ST = cast<MemSDNode>(Val: N);
1304 assert(ST->writeMem() && "Expected store");
1305 StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(Val: ST);
1306 AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(Val: ST);
1307 assert((PlainStore || AtomicStore) && "Expected store");
1308
1309 // do not support pre/post inc/dec
1310 if (PlainStore && PlainStore->isIndexed())
1311 return false;
1312
1313 const EVT StoreVT = ST->getMemoryVT();
1314 if (!StoreVT.isSimple())
1315 return false;
1316
1317 // Address Space Setting
1318 const unsigned CodeAddrSpace = getCodeAddrSpace(N: ST);
1319
1320 SDLoc DL(ST);
1321 SDValue Chain = ST->getChain();
1322 const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, N: ST);
1323
1324 // Vector Setting
1325 const unsigned ToTypeWidth = StoreVT.getSimpleVT().getSizeInBits();
1326
1327 // Create the machine instruction DAG
1328 SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
1329
1330 assert(isPowerOf2_32(ToTypeWidth) && ToTypeWidth >= 8 && ToTypeWidth <= 128 &&
1331 "Invalid width for store");
1332
1333 SDValue Offset, Base;
1334 SelectADDR(Addr: ST->getBasePtr(), Base, Offset);
1335
1336 SDValue Ops[] = {selectPossiblyImm(V: Value),
1337 getI32Imm(Imm: Ordering, DL),
1338 getI32Imm(Imm: Scope, DL),
1339 getI32Imm(Imm: CodeAddrSpace, DL),
1340 getI32Imm(Imm: ToTypeWidth, DL),
1341 Base,
1342 Offset,
1343 Chain};
1344
1345 const std::optional<unsigned> Opcode =
1346 pickOpcodeForVT(VT: Value.getSimpleValueType().SimpleTy, Opcode_i8: NVPTX::ST_i8,
1347 Opcode_i16: NVPTX::ST_i16, Opcode_i32: NVPTX::ST_i32, Opcode_i64: NVPTX::ST_i64);
1348 if (!Opcode)
1349 return false;
1350
1351 SDNode *NVPTXST = CurDAG->getMachineNode(Opcode: *Opcode, dl: DL, VT: MVT::Other, Ops);
1352
1353 if (!NVPTXST)
1354 return false;
1355
1356 MachineMemOperand *MemRef = ST->getMemOperand();
1357 CurDAG->setNodeMemRefs(N: cast<MachineSDNode>(Val: NVPTXST), NewMemRefs: {MemRef});
1358 ReplaceNode(F: ST, T: NVPTXST);
1359 return true;
1360}
1361
1362bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
1363 MemSDNode *ST = cast<MemSDNode>(Val: N);
1364 const EVT StoreVT = ST->getMemoryVT();
1365 assert(StoreVT.isSimple() && "Store value is not simple");
1366
1367 // Address Space Setting
1368 const unsigned CodeAddrSpace = getCodeAddrSpace(N: ST);
1369 if (CodeAddrSpace == NVPTX::AddressSpace::Const) {
1370 report_fatal_error(reason: "Cannot store to pointer that points to constant "
1371 "memory space");
1372 }
1373
1374 SDLoc DL(ST);
1375 SDValue Chain = ST->getChain();
1376 const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, N: ST);
1377
1378 // Type Setting: toType + toTypeWidth
1379 // - for integer type, always use 'u'
1380 const unsigned TotalWidth = StoreVT.getSimpleVT().getSizeInBits();
1381
1382 const unsigned NumElts = getLoadStoreVectorNumElts(N: ST);
1383
1384 SmallVector<SDValue, 16> Ops;
1385 for (auto &V : ST->ops().slice(N: 1, M: NumElts))
1386 Ops.push_back(Elt: selectPossiblyImm(V));
1387 SDValue Addr = N->getOperand(Num: NumElts + 1);
1388 const unsigned ToTypeWidth = TotalWidth / NumElts;
1389
1390 assert(isPowerOf2_32(ToTypeWidth) && ToTypeWidth >= 8 && ToTypeWidth <= 128 &&
1391 TotalWidth <= 256 && "Invalid width for store");
1392
1393 SDValue Offset, Base;
1394 SelectADDR(Addr, Base, Offset);
1395
1396 Ops.append(IL: {getI32Imm(Imm: Ordering, DL), getI32Imm(Imm: Scope, DL),
1397 getI32Imm(Imm: CodeAddrSpace, DL), getI32Imm(Imm: ToTypeWidth, DL), Base,
1398 Offset, Chain});
1399
1400 const MVT::SimpleValueType EltVT =
1401 ST->getOperand(Num: 1).getSimpleValueType().SimpleTy;
1402 std::optional<unsigned> Opcode;
1403 switch (ST->getOpcode()) {
1404 default:
1405 return false;
1406 case NVPTXISD::StoreV2:
1407 Opcode = pickOpcodeForVT(VT: EltVT, Opcode_i8: NVPTX::STV_i8_v2, Opcode_i16: NVPTX::STV_i16_v2,
1408 Opcode_i32: NVPTX::STV_i32_v2, Opcode_i64: NVPTX::STV_i64_v2);
1409 break;
1410 case NVPTXISD::StoreV4:
1411 Opcode = pickOpcodeForVT(VT: EltVT, Opcode_i8: NVPTX::STV_i8_v4, Opcode_i16: NVPTX::STV_i16_v4,
1412 Opcode_i32: NVPTX::STV_i32_v4, Opcode_i64: NVPTX::STV_i64_v4);
1413 break;
1414 case NVPTXISD::StoreV8:
1415 Opcode = pickOpcodeForVT(VT: EltVT, Opcode_i8: {/* no v8i8 */}, Opcode_i16: {/* no v8i16 */},
1416 Opcode_i32: NVPTX::STV_i32_v8, Opcode_i64: {/* no v8i64 */});
1417 break;
1418 }
1419
1420 if (!Opcode)
1421 return false;
1422
1423 SDNode *NVPTXST = CurDAG->getMachineNode(Opcode: *Opcode, dl: DL, VT: MVT::Other, Ops);
1424
1425 MachineMemOperand *MemRef = ST->getMemOperand();
1426 CurDAG->setNodeMemRefs(N: cast<MachineSDNode>(Val: NVPTXST), NewMemRefs: {MemRef});
1427
1428 ReplaceNode(F: ST, T: NVPTXST);
1429 return true;
1430}
1431
1432bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
1433 SDValue Chain = Node->getOperand(Num: 0);
1434 SDValue Offset = Node->getOperand(Num: 2);
1435 SDValue Glue = Node->getOperand(Num: 3);
1436 SDLoc DL(Node);
1437 MemSDNode *Mem = cast<MemSDNode>(Val: Node);
1438
1439 unsigned VecSize;
1440 switch (Node->getOpcode()) {
1441 default:
1442 return false;
1443 case NVPTXISD::LoadParam:
1444 VecSize = 1;
1445 break;
1446 case NVPTXISD::LoadParamV2:
1447 VecSize = 2;
1448 break;
1449 case NVPTXISD::LoadParamV4:
1450 VecSize = 4;
1451 break;
1452 }
1453
1454 EVT EltVT = Node->getValueType(ResNo: 0);
1455 EVT MemVT = Mem->getMemoryVT();
1456
1457 std::optional<unsigned> Opcode;
1458
1459 switch (VecSize) {
1460 default:
1461 return false;
1462 case 1:
1463 Opcode = pickOpcodeForVT(VT: MemVT.getSimpleVT().SimpleTy,
1464 Opcode_i8: NVPTX::LoadParamMemI8, Opcode_i16: NVPTX::LoadParamMemI16,
1465 Opcode_i32: NVPTX::LoadParamMemI32, Opcode_i64: NVPTX::LoadParamMemI64);
1466 break;
1467 case 2:
1468 Opcode =
1469 pickOpcodeForVT(VT: MemVT.getSimpleVT().SimpleTy, Opcode_i8: NVPTX::LoadParamMemV2I8,
1470 Opcode_i16: NVPTX::LoadParamMemV2I16, Opcode_i32: NVPTX::LoadParamMemV2I32,
1471 Opcode_i64: NVPTX::LoadParamMemV2I64);
1472 break;
1473 case 4:
1474 Opcode = pickOpcodeForVT(VT: MemVT.getSimpleVT().SimpleTy,
1475 Opcode_i8: NVPTX::LoadParamMemV4I8, Opcode_i16: NVPTX::LoadParamMemV4I16,
1476 Opcode_i32: NVPTX::LoadParamMemV4I32, Opcode_i64: {/* no v4i64 */});
1477 break;
1478 }
1479 if (!Opcode)
1480 return false;
1481
1482 SDVTList VTs;
1483 if (VecSize == 1) {
1484 VTs = CurDAG->getVTList(VT1: EltVT, VT2: MVT::Other, VT3: MVT::Glue);
1485 } else if (VecSize == 2) {
1486 VTs = CurDAG->getVTList(VT1: EltVT, VT2: EltVT, VT3: MVT::Other, VT4: MVT::Glue);
1487 } else {
1488 EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue };
1489 VTs = CurDAG->getVTList(VTs: EVTs);
1490 }
1491
1492 unsigned OffsetVal = Offset->getAsZExtVal();
1493
1494 SmallVector<SDValue, 2> Ops(
1495 {CurDAG->getTargetConstant(Val: OffsetVal, DL, VT: MVT::i32), Chain, Glue});
1496
1497 ReplaceNode(F: Node, T: CurDAG->getMachineNode(Opcode: *Opcode, dl: DL, VTs, Ops));
1498 return true;
1499}
1500
1501// Helpers for constructing opcode (ex: NVPTX::StoreParamV4F32_iiri)
1502#define getOpcV2H(ty, opKind0, opKind1) \
1503 NVPTX::StoreParamV2##ty##_##opKind0##opKind1
1504
1505#define getOpcV2H1(ty, opKind0, isImm1) \
1506 (isImm1) ? getOpcV2H(ty, opKind0, i) : getOpcV2H(ty, opKind0, r)
1507
1508#define getOpcodeForVectorStParamV2(ty, isimm) \
1509 (isimm[0]) ? getOpcV2H1(ty, i, isimm[1]) : getOpcV2H1(ty, r, isimm[1])
1510
1511#define getOpcV4H(ty, opKind0, opKind1, opKind2, opKind3) \
1512 NVPTX::StoreParamV4##ty##_##opKind0##opKind1##opKind2##opKind3
1513
1514#define getOpcV4H3(ty, opKind0, opKind1, opKind2, isImm3) \
1515 (isImm3) ? getOpcV4H(ty, opKind0, opKind1, opKind2, i) \
1516 : getOpcV4H(ty, opKind0, opKind1, opKind2, r)
1517
1518#define getOpcV4H2(ty, opKind0, opKind1, isImm2, isImm3) \
1519 (isImm2) ? getOpcV4H3(ty, opKind0, opKind1, i, isImm3) \
1520 : getOpcV4H3(ty, opKind0, opKind1, r, isImm3)
1521
1522#define getOpcV4H1(ty, opKind0, isImm1, isImm2, isImm3) \
1523 (isImm1) ? getOpcV4H2(ty, opKind0, i, isImm2, isImm3) \
1524 : getOpcV4H2(ty, opKind0, r, isImm2, isImm3)
1525
1526#define getOpcodeForVectorStParamV4(ty, isimm) \
1527 (isimm[0]) ? getOpcV4H1(ty, i, isimm[1], isimm[2], isimm[3]) \
1528 : getOpcV4H1(ty, r, isimm[1], isimm[2], isimm[3])
1529
1530#define getOpcodeForVectorStParam(n, ty, isimm) \
1531 (n == 2) ? getOpcodeForVectorStParamV2(ty, isimm) \
1532 : getOpcodeForVectorStParamV4(ty, isimm)
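
// Illustrative expansion of the helpers above: with ty = I32 and
// isimm = {true, false, false, true}, getOpcodeForVectorStParamV4 expands to
// NVPTX::StoreParamV4I32_irri, i.e. each 'i'/'r' suffix position records
// whether the corresponding source operand is an immediate or a register.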
1533
1534static unsigned pickOpcodeForVectorStParam(SmallVector<SDValue, 8> &Ops,
1535 unsigned NumElts,
1536 MVT::SimpleValueType MemTy,
1537 SelectionDAG *CurDAG, SDLoc DL) {
  // Determine which inputs are registers or immediates, and replace the
  // immediate inputs with equivalent target-constant operands.
1540 SmallVector<bool, 4> IsImm(NumElts, false);
1541 for (unsigned i = 0; i < NumElts; i++) {
1542 IsImm[i] = (isa<ConstantSDNode>(Val: Ops[i]) || isa<ConstantFPSDNode>(Val: Ops[i]));
1543 if (IsImm[i]) {
1544 SDValue Imm = Ops[i];
1545 if (MemTy == MVT::f32 || MemTy == MVT::f64) {
1546 const ConstantFPSDNode *ConstImm = cast<ConstantFPSDNode>(Val&: Imm);
1547 const ConstantFP *CF = ConstImm->getConstantFPValue();
1548 Imm = CurDAG->getTargetConstantFP(Val: *CF, DL, VT: Imm->getValueType(ResNo: 0));
1549 } else {
1550 const ConstantSDNode *ConstImm = cast<ConstantSDNode>(Val&: Imm);
1551 const ConstantInt *CI = ConstImm->getConstantIntValue();
1552 Imm = CurDAG->getTargetConstant(Val: *CI, DL, VT: Imm->getValueType(ResNo: 0));
1553 }
1554 Ops[i] = Imm;
1555 }
1556 }
1557
1558 // Get opcode for MemTy, size, and register/immediate operand ordering
1559 switch (MemTy) {
1560 case MVT::i8:
1561 return getOpcodeForVectorStParam(NumElts, I8, IsImm);
1562 case MVT::i16:
1563 return getOpcodeForVectorStParam(NumElts, I16, IsImm);
1564 case MVT::i32:
1565 return getOpcodeForVectorStParam(NumElts, I32, IsImm);
1566 case MVT::i64:
1567 assert(NumElts == 2 && "MVT too large for NumElts > 2");
1568 return getOpcodeForVectorStParamV2(I64, IsImm);
1569 case MVT::f32:
1570 return getOpcodeForVectorStParam(NumElts, F32, IsImm);
1571 case MVT::f64:
1572 assert(NumElts == 2 && "MVT too large for NumElts > 2");
1573 return getOpcodeForVectorStParamV2(F64, IsImm);
1574
1575  // These cases don't support immediates; just use the all-register version
1576  // and generate moves.
1577 case MVT::i1:
1578 return (NumElts == 2) ? NVPTX::StoreParamV2I8_rr
1579 : NVPTX::StoreParamV4I8_rrrr;
1580 case MVT::f16:
1581 case MVT::bf16:
1582 return (NumElts == 2) ? NVPTX::StoreParamV2I16_rr
1583 : NVPTX::StoreParamV4I16_rrrr;
1584 case MVT::v2f16:
1585 case MVT::v2bf16:
1586 case MVT::v2i16:
1587 case MVT::v4i8:
1588 return (NumElts == 2) ? NVPTX::StoreParamV2I32_rr
1589 : NVPTX::StoreParamV4I32_rrrr;
1590 default:
1591 llvm_unreachable("Cannot select st.param for unknown MemTy");
1592 }
1593}
1594
1595bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
1596 SDLoc DL(N);
1597 SDValue Chain = N->getOperand(Num: 0);
1598 SDValue Param = N->getOperand(Num: 1);
1599 unsigned ParamVal = Param->getAsZExtVal();
1600 SDValue Offset = N->getOperand(Num: 2);
1601 unsigned OffsetVal = Offset->getAsZExtVal();
1602 MemSDNode *Mem = cast<MemSDNode>(Val: N);
1603 SDValue Glue = N->getOperand(Num: N->getNumOperands() - 1);
1604
1605 // How many elements do we have?
1606 unsigned NumElts;
1607 switch (N->getOpcode()) {
1608 default:
1609 llvm_unreachable("Unexpected opcode");
1610 case NVPTXISD::StoreParam:
1611 NumElts = 1;
1612 break;
1613 case NVPTXISD::StoreParamV2:
1614 NumElts = 2;
1615 break;
1616 case NVPTXISD::StoreParamV4:
1617 NumElts = 4;
1618 break;
1619 }
1620
1621 // Build vector of operands
1622 SmallVector<SDValue, 8> Ops;
1623 for (unsigned i = 0; i < NumElts; ++i)
1624 Ops.push_back(Elt: N->getOperand(Num: i + 3));
1625 Ops.append(IL: {CurDAG->getTargetConstant(Val: ParamVal, DL, VT: MVT::i32),
1626 CurDAG->getTargetConstant(Val: OffsetVal, DL, VT: MVT::i32), Chain, Glue});
1627
1628 // Determine target opcode
1629 // If we have an i1, use an 8-bit store. The lowering code in
1630 // NVPTXISelLowering will have already emitted an upcast.
1631 std::optional<unsigned> Opcode;
1632 switch (NumElts) {
1633 default:
1634 llvm_unreachable("Unexpected NumElts");
1635 case 1: {
1636 MVT::SimpleValueType MemTy = Mem->getMemoryVT().getSimpleVT().SimpleTy;
1637 SDValue Imm = Ops[0];
1638 if (MemTy != MVT::f16 && MemTy != MVT::bf16 &&
1639 (isa<ConstantSDNode>(Val: Imm) || isa<ConstantFPSDNode>(Val: Imm))) {
1640 // Convert immediate to target constant
1641 if (MemTy == MVT::f32 || MemTy == MVT::f64) {
1642 const ConstantFPSDNode *ConstImm = cast<ConstantFPSDNode>(Val&: Imm);
1643 const ConstantFP *CF = ConstImm->getConstantFPValue();
1644 Imm = CurDAG->getTargetConstantFP(Val: *CF, DL, VT: Imm->getValueType(ResNo: 0));
1645 } else {
1646 const ConstantSDNode *ConstImm = cast<ConstantSDNode>(Val&: Imm);
1647 const ConstantInt *CI = ConstImm->getConstantIntValue();
1648 Imm = CurDAG->getTargetConstant(Val: *CI, DL, VT: Imm->getValueType(ResNo: 0));
1649 }
1650 Ops[0] = Imm;
1651 // Use immediate version of store param
1652 Opcode =
1653 pickOpcodeForVT(VT: MemTy, Opcode_i8: NVPTX::StoreParamI8_i, Opcode_i16: NVPTX::StoreParamI16_i,
1654 Opcode_i32: NVPTX::StoreParamI32_i, Opcode_i64: NVPTX::StoreParamI64_i);
1655 } else
1656 Opcode = pickOpcodeForVT(VT: Mem->getMemoryVT().getSimpleVT().SimpleTy,
1657 Opcode_i8: NVPTX::StoreParamI8_r, Opcode_i16: NVPTX::StoreParamI16_r,
1658 Opcode_i32: NVPTX::StoreParamI32_r, Opcode_i64: NVPTX::StoreParamI64_r);
1659 if (Opcode == NVPTX::StoreParamI8_r) {
1660      // Fine-tune the opcode depending on the size of the operand.
1661 // This helps to avoid creating redundant COPY instructions in
1662 // InstrEmitter::AddRegisterOperand().
1663 switch (Ops[0].getSimpleValueType().SimpleTy) {
1664 default:
1665 break;
1666 case MVT::i32:
1667 Opcode = NVPTX::StoreParamI8TruncI32_r;
1668 break;
1669 case MVT::i64:
1670 Opcode = NVPTX::StoreParamI8TruncI64_r;
1671 break;
1672 }
1673 }
1674 break;
1675 }
1676 case 2:
1677 case 4: {
1678 MVT::SimpleValueType MemTy = Mem->getMemoryVT().getSimpleVT().SimpleTy;
1679 Opcode = pickOpcodeForVectorStParam(Ops, NumElts, MemTy, CurDAG, DL);
1680 break;
1681 }
1682 }
1683
1684 SDVTList RetVTs = CurDAG->getVTList(VT1: MVT::Other, VT2: MVT::Glue);
1685 SDNode *Ret = CurDAG->getMachineNode(Opcode: *Opcode, dl: DL, VTs: RetVTs, Ops);
1686 MachineMemOperand *MemRef = cast<MemSDNode>(Val: N)->getMemOperand();
1687 CurDAG->setNodeMemRefs(N: cast<MachineSDNode>(Val: Ret), NewMemRefs: {MemRef});
1688
1689 ReplaceNode(F: N, T: Ret);
1690 return true;
1691}
1692
1693/// tryBFE - Look for instruction sequences that can be made more efficient
1694/// by using the 'bfe' (bit-field extract) PTX instruction.
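/// For illustration, (and (srl %x:i32, 8), 255) extracts the 8-bit field that
/// starts at bit 8, so it is selected as BFE_U32rii %x, 8, 8, i.e. the PTX
/// "bfe.u32" form with start 8 and length 8.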
1695bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
1696 SDLoc DL(N);
1697 SDValue LHS = N->getOperand(Num: 0);
1698 SDValue RHS = N->getOperand(Num: 1);
1699 SDValue Len;
1700 SDValue Start;
1701 SDValue Val;
1702 bool IsSigned = false;
1703
1704 if (N->getOpcode() == ISD::AND) {
1705 // Canonicalize the operands
1706 // We want 'and %val, %mask'
1707 if (isa<ConstantSDNode>(Val: LHS) && !isa<ConstantSDNode>(Val: RHS)) {
1708 std::swap(a&: LHS, b&: RHS);
1709 }
1710
1711 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Val&: RHS);
1712 if (!Mask) {
1713 // We need a constant mask on the RHS of the AND
1714 return false;
1715 }
1716
1717 // Extract the mask bits
1718 uint64_t MaskVal = Mask->getZExtValue();
1719 if (!isMask_64(Value: MaskVal)) {
1720      // We *could* handle shifted masks here, but doing so would require an
1721      // 'and' operation to fix up the low-order bits, so we would trade
1722      // shr+and for bfe+and, which has the same throughput.
1723 return false;
1724 }
1725
1726 // How many bits are in our mask?
1727 int64_t NumBits = countr_one(Value: MaskVal);
1728 Len = CurDAG->getTargetConstant(Val: NumBits, DL, VT: MVT::i32);
1729
1730 if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {
1731 // We have a 'srl/and' pair, extract the effective start bit and length
1732 Val = LHS.getNode()->getOperand(Num: 0);
1733 Start = LHS.getNode()->getOperand(Num: 1);
1734 ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Val&: Start);
1735 if (StartConst) {
1736 uint64_t StartVal = StartConst->getZExtValue();
1737 // How many "good" bits do we have left? "good" is defined here as bits
1738 // that exist in the original value, not shifted in.
1739 int64_t GoodBits = Start.getValueSizeInBits() - StartVal;
1740 if (NumBits > GoodBits) {
1741 // Do not handle the case where bits have been shifted in. In theory
1742 // we could handle this, but the cost is likely higher than just
1743 // emitting the srl/and pair.
1744 return false;
1745 }
1746 Start = CurDAG->getTargetConstant(Val: StartVal, DL, VT: MVT::i32);
1747 } else {
1748        // Do not handle the case where the shift amount (which can be zero
1749        // if no srl was found) is not constant. We could handle this case,
1750        // but it would require run-time logic that would be more expensive
1751        // than just emitting the srl/and pair.
1752 return false;
1753 }
1754 } else {
1755      // Do not handle the case where the LHS of the and is not a shift. While
1756      // it would be trivial to handle this case, it would just transform
1757      // 'and' -> 'bfe', but 'and' has higher throughput.
1758 return false;
1759 }
1760 } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
1761 if (LHS->getOpcode() == ISD::AND) {
1762 ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(Val&: RHS);
1763 if (!ShiftCnst) {
1764 // Shift amount must be constant
1765 return false;
1766 }
1767
1768 uint64_t ShiftAmt = ShiftCnst->getZExtValue();
1769
1770 SDValue AndLHS = LHS->getOperand(Num: 0);
1771 SDValue AndRHS = LHS->getOperand(Num: 1);
1772
1773 // Canonicalize the AND to have the mask on the RHS
1774 if (isa<ConstantSDNode>(Val: AndLHS)) {
1775 std::swap(a&: AndLHS, b&: AndRHS);
1776 }
1777
1778 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Val&: AndRHS);
1779 if (!MaskCnst) {
1780 // Mask must be constant
1781 return false;
1782 }
1783
1784 uint64_t MaskVal = MaskCnst->getZExtValue();
1785 uint64_t NumZeros;
1786 uint64_t NumBits;
1787 if (isMask_64(Value: MaskVal)) {
1788 NumZeros = 0;
1789 // The number of bits in the result bitfield will be the number of
1790 // trailing ones (the AND) minus the number of bits we shift off
1791 NumBits = llvm::countr_one(Value: MaskVal) - ShiftAmt;
1792 } else if (isShiftedMask_64(Value: MaskVal)) {
1793 NumZeros = llvm::countr_zero(Val: MaskVal);
1794 unsigned NumOnes = llvm::countr_one(Value: MaskVal >> NumZeros);
1795 // The number of bits in the result bitfield will be the number of
1796 // trailing zeros plus the number of set bits in the mask minus the
1797 // number of bits we shift off
1798 NumBits = NumZeros + NumOnes - ShiftAmt;
1799 } else {
1800 // This is not a mask we can handle
1801 return false;
1802 }
1803
1804 if (ShiftAmt < NumZeros) {
1805 // Handling this case would require extra logic that would make this
1806 // transformation non-profitable
1807 return false;
1808 }
1809
1810 Val = AndLHS;
1811 Start = CurDAG->getTargetConstant(Val: ShiftAmt, DL, VT: MVT::i32);
1812 Len = CurDAG->getTargetConstant(Val: NumBits, DL, VT: MVT::i32);
1813
1814      // If the pre-shift AND includes the sign bit in the bitfield, we must
1815      // use a signed BFE to replicate that bit during bitfield extraction.
1816      // If the sign bit is not part of the mask, an unsigned BFE will zero
1817      // out the upper bits of the result.
1818 if (N->getOpcode() == ISD::SRA)
1819 IsSigned = (ShiftAmt + NumBits) == Val.getValueSizeInBits();
1820 } else if (LHS->getOpcode() == ISD::SHL) {
1821 // Here, we have a pattern like:
1822 //
1823 // (sra (shl val, NN), MM)
1824 // or
1825 // (srl (shl val, NN), MM)
1826 //
1827 // If MM >= NN, we can efficiently optimize this with bfe
1828 Val = LHS->getOperand(Num: 0);
1829
1830 SDValue ShlRHS = LHS->getOperand(Num: 1);
1831 ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(Val&: ShlRHS);
1832 if (!ShlCnst) {
1833 // Shift amount must be constant
1834 return false;
1835 }
1836 uint64_t InnerShiftAmt = ShlCnst->getZExtValue();
1837
1838 SDValue ShrRHS = RHS;
1839 ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(Val&: ShrRHS);
1840 if (!ShrCnst) {
1841 // Shift amount must be constant
1842 return false;
1843 }
1844 uint64_t OuterShiftAmt = ShrCnst->getZExtValue();
1845
1846 // To avoid extra codegen and be profitable, we need Outer >= Inner
1847 if (OuterShiftAmt < InnerShiftAmt) {
1848 return false;
1849 }
1850
1851      // If the outer shift is more than the type size, we have no bitfield to
1852      // extract (and since we also check that the inner shift is <= the outer
1853      // shift, this also implies that the inner shift is < the type size).
1854 if (OuterShiftAmt >= Val.getValueSizeInBits()) {
1855 return false;
1856 }
1857
1858 Start = CurDAG->getTargetConstant(Val: OuterShiftAmt - InnerShiftAmt, DL,
1859 VT: MVT::i32);
1860 Len = CurDAG->getTargetConstant(Val: Val.getValueSizeInBits() - OuterShiftAmt,
1861 DL, VT: MVT::i32);
1862
1863 if (N->getOpcode() == ISD::SRA) {
1864        // If we have an arithmetic right shift, we need to use the signed bfe
1865        // variant.
1866 IsSigned = true;
1867 }
1868 } else {
1869 // No can do...
1870 return false;
1871 }
1872 } else {
1873 // No can do...
1874 return false;
1875 }
1876
1877
1878 unsigned Opc;
1879  // For the BFE operations we form here from "and" and plain "srl", we always
1880  // use the unsigned variants; IsSigned is only set for the "sra" patterns.
1881 if (Val.getValueType() == MVT::i32) {
1882 if (IsSigned) {
1883 Opc = NVPTX::BFE_S32rii;
1884 } else {
1885 Opc = NVPTX::BFE_U32rii;
1886 }
1887 } else if (Val.getValueType() == MVT::i64) {
1888 if (IsSigned) {
1889 Opc = NVPTX::BFE_S64rii;
1890 } else {
1891 Opc = NVPTX::BFE_U64rii;
1892 }
1893 } else {
1894 // We cannot handle this type
1895 return false;
1896 }
1897
1898 SDValue Ops[] = {
1899 Val, Start, Len
1900 };
1901
1902 ReplaceNode(F: N, T: CurDAG->getMachineNode(Opcode: Opc, dl: DL, VTs: N->getVTList(), Ops));
1903 return true;
1904}
1905
1906// Select bf16/v2bf16 FADD, FSUB, FMUL as fma on targets that only provide fma
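// For example, on a target without native bf16 multiply support,
// (fmul bf16 %a, %b) is selected below as BFMA16rrr %a, %b, %c, where %c is
// a -0.0 constant materialized with BFMOV16i.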
1907bool NVPTXDAGToDAGISel::tryBF16ArithToFMA(SDNode *N) {
1908 EVT VT = SDValue(N, 0).getValueType();
1909 if (VT.getScalarType() != MVT::bf16)
1910 return false;
1911
1912 const NVPTXSubtarget *STI = TM.getSubtargetImpl();
1913 if (STI->hasNativeBF16Support(Opcode: N->getOpcode()))
1914 return false;
1915
1916 const bool IsVec = VT.isVector();
1917 assert(!IsVec || VT.getVectorNumElements() == 2);
1918 SDLoc DL(N);
1919 SDValue N0 = N->getOperand(Num: 0);
1920 SDValue N1 = N->getOperand(Num: 1);
1921 SmallVector<SDValue, 3> Operands;
1922 auto GetConstant = [&](float Value) -> SDValue {
1923 // BF16 immediates must be legalized to integer register values
1924 APFloat APF(Value);
1925 bool LosesInfo;
1926 APF.convert(ToSemantics: APFloat::BFloat(), RM: APFloat::rmNearestTiesToEven, losesInfo: &LosesInfo);
1927 assert(!LosesInfo);
1928 if (IsVec) {
1929 auto API = APF.bitcastToAPInt();
1930 API = API.concat(NewLSB: API);
1931 auto Const = CurDAG->getTargetConstant(Val: API, DL, VT: MVT::i32);
1932 return SDValue(CurDAG->getMachineNode(Opcode: NVPTX::IMOV32i, dl: DL, VT, Op1: Const), 0);
1933 }
1934 auto Const = CurDAG->getTargetConstantFP(Val: APF, DL, VT);
1935 return SDValue(CurDAG->getMachineNode(Opcode: NVPTX::BFMOV16i, dl: DL, VT, Op1: Const), 0);
1936 };
1937
1938 switch (N->getOpcode()) {
1939 case ISD::FADD:
1940 // add(a, b) -> fma(a, 1.0, b)
1941 Operands = {N0, GetConstant(1.0), N1};
1942 break;
1943 case ISD::FSUB:
1944 // sub(a, b) -> fma(b, -1.0, a)
1945 Operands = {N1, GetConstant(-1.0), N0};
1946 break;
1947 case ISD::FMUL:
1948 // mul(a, b) -> fma(a, b, -0.0)
1949    // NOTE: The identity is -0, not 0, since a +0 addend would turn -0 into +0
1950 Operands = {N0, N1, GetConstant(-0.0)};
1951 break;
1952 default:
1953 llvm_unreachable("Unexpected opcode");
1954  }
1955
1956 int Opcode = IsVec ? NVPTX::BFMA16x2rrr : NVPTX::BFMA16rrr;
1957 MachineSDNode *FMA = CurDAG->getMachineNode(Opcode, dl: DL, VT, Ops: Operands);
1958 ReplaceNode(F: N, T: FMA);
1959 return true;
1960}
1961
1962static inline bool isAddLike(const SDValue V) {
1963 return V.getOpcode() == ISD::ADD ||
1964 (V->getOpcode() == ISD::OR && V->getFlags().hasDisjoint());
1965}
1966
1967// selectBaseADDR - Match a dag node which will serve as the base address for an
1968// ADDR operand pair.
1969static SDValue selectBaseADDR(SDValue N, SelectionDAG *DAG) {
1970 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(Val&: N))
1971 return DAG->getTargetGlobalAddress(GV: GA->getGlobal(), DL: SDLoc(N),
1972 VT: GA->getValueType(ResNo: 0), offset: GA->getOffset(),
1973 TargetFlags: GA->getTargetFlags());
1974 if (const auto *ES = dyn_cast<ExternalSymbolSDNode>(Val&: N))
1975 return DAG->getTargetExternalSymbol(Sym: ES->getSymbol(), VT: ES->getValueType(ResNo: 0),
1976 TargetFlags: ES->getTargetFlags());
1977 if (const auto *FIN = dyn_cast<FrameIndexSDNode>(Val&: N))
1978 return DAG->getTargetFrameIndex(FI: FIN->getIndex(), VT: FIN->getValueType(ResNo: 0));
1979
1980 return N;
1981}
1982
1983static SDValue accumulateOffset(SDValue &Addr, SDLoc DL, SelectionDAG *DAG) {
1984 APInt AccumulatedOffset(64u, 0);
1985 while (isAddLike(V: Addr)) {
1986 const auto *CN = dyn_cast<ConstantSDNode>(Val: Addr.getOperand(i: 1));
1987 if (!CN)
1988 break;
1989
1990 const APInt CI = CN->getAPIntValue().sext(width: 64);
1991 if (!(CI + AccumulatedOffset).isSignedIntN(N: 32))
1992 break;
1993
1994 AccumulatedOffset += CI;
1995 Addr = Addr->getOperand(Num: 0);
1996 }
1997 return DAG->getSignedTargetConstant(Val: AccumulatedOffset.getSExtValue(), DL,
1998 VT: MVT::i32);
1999}
2000
2001// Select a pair of operands which represent a valid PTX address. This could be
2002// one of the following:
2003// - [var] - Offset is simply set to 0
2004// - [reg] - Offset is simply set to 0
2005// - [reg+immOff]
2006// - [var+immOff]
2007// Note that immOff must fit into a 32-bit signed integer.
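// For illustration, given Addr = (add (add %base, 4), 8), accumulateOffset
// folds both constants, so SelectADDR returns Base = %base and Offset = 12,
// i.e. the PTX addressing form [%base+12].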
2008bool NVPTXDAGToDAGISel::SelectADDR(SDValue Addr, SDValue &Base,
2009 SDValue &Offset) {
2010 Offset = accumulateOffset(Addr, DL: SDLoc(Addr), DAG: CurDAG);
2011 Base = selectBaseADDR(N: Addr, DAG: CurDAG);
2012 return true;
2013}
2014
2015SDValue NVPTXDAGToDAGISel::selectPossiblyImm(SDValue V) {
2016 if (V.getOpcode() == ISD::BITCAST)
2017 V = V.getOperand(i: 0);
2018
2019 if (auto *CN = dyn_cast<ConstantSDNode>(Val&: V))
2020 return CurDAG->getTargetConstant(Val: CN->getAPIntValue(), DL: SDLoc(V),
2021 VT: V.getValueType());
2022 if (auto *CN = dyn_cast<ConstantFPSDNode>(Val&: V))
2023 return CurDAG->getTargetConstantFP(Val: CN->getValueAPF(), DL: SDLoc(V),
2024 VT: V.getValueType());
2025 return V;
2026}
2027
2028bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
2029 unsigned int spN) const {
2030 const Value *Src = nullptr;
2031 if (MemSDNode *mN = dyn_cast<MemSDNode>(Val: N)) {
2032 if (spN == 0 && mN->getMemOperand()->getPseudoValue())
2033 return true;
2034 Src = mN->getMemOperand()->getValue();
2035 }
2036 if (!Src)
2037 return false;
2038 if (auto *PT = dyn_cast<PointerType>(Val: Src->getType()))
2039 return (PT->getAddressSpace() == spN);
2040 return false;
2041}
2042
2043/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
2044/// inline asm expressions.
2045bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
2046 const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
2047 std::vector<SDValue> &OutOps) {
2048 SDValue Op0, Op1;
2049 switch (ConstraintID) {
2050 default:
2051 return true;
2052 case InlineAsm::ConstraintCode::m: // memory
2053 if (SelectADDR(Addr: Op, Base&: Op0, Offset&: Op1)) {
2054 OutOps.push_back(x: Op0);
2055 OutOps.push_back(x: Op1);
2056 return false;
2057 }
2058 break;
2059 }
2060 return true;
2061}
2062
2063void NVPTXDAGToDAGISel::SelectV2I64toI128(SDNode *N) {
2064 // Lower a CopyToReg with two 64-bit inputs
2065 // Dst:i128, lo:i64, hi:i64
2066 //
2067 // CopyToReg Dst, lo, hi;
2068 //
2069 // ==>
2070 //
2071 // tmp = V2I64toI128 {lo, hi};
2072 // CopyToReg Dst, tmp;
2073 SDValue Dst = N->getOperand(Num: 1);
2074 SDValue Lo = N->getOperand(Num: 2);
2075 SDValue Hi = N->getOperand(Num: 3);
2076
2077 SDLoc DL(N);
2078 SDNode *Mov =
2079 CurDAG->getMachineNode(Opcode: NVPTX::V2I64toI128, dl: DL, VT: MVT::i128, Ops: {Lo, Hi});
2080
2081 SmallVector<SDValue, 4> NewOps(N->getNumOperands() - 1);
2082 NewOps[0] = N->getOperand(Num: 0);
2083 NewOps[1] = Dst;
2084 NewOps[2] = SDValue(Mov, 0);
2085 if (N->getNumOperands() == 5)
2086 NewOps[3] = N->getOperand(Num: 4);
2087 SDValue NewValue = CurDAG->getNode(Opcode: ISD::CopyToReg, DL, ResultTys: SmallVector<EVT>(N->values()), Ops: NewOps);
2088
2089 ReplaceNode(F: N, T: NewValue.getNode());
2090}
2091
2092void NVPTXDAGToDAGISel::SelectI128toV2I64(SDNode *N) {
2093  // Lower CopyFromReg from a 128-bit reg to two 64-bit regs
2094 // Dst:i128, Src:i128
2095 //
2096 // {lo, hi} = CopyFromReg Src
2097 //
2098 // ==>
2099 //
2100 // {lo, hi} = I128toV2I64 Src
2101 //
2102 SDValue Ch = N->getOperand(Num: 0);
2103 SDValue Src = N->getOperand(Num: 1);
2104 SDValue Glue = N->getOperand(Num: 2);
2105 SDLoc DL(N);
2106
2107  // Add Glue and Ch to the operands and results to avoid breaking the
2108  // execution order.
2109 SDNode *Mov = CurDAG->getMachineNode(
2110 Opcode: NVPTX::I128toV2I64, dl: DL,
2111 ResultTys: {MVT::i64, MVT::i64, Ch.getValueType(), Glue.getValueType()},
2112 Ops: {Src, Ch, Glue});
2113
2114 ReplaceNode(F: N, T: Mov);
2115}
2116
2117bool NVPTXDAGToDAGISel::tryFence(SDNode *N) {
2118 SDLoc DL(N);
2119 assert(N->getOpcode() == ISD::ATOMIC_FENCE);
2120 unsigned int FenceOp =
2121 getFenceOp(O: NVPTX::Ordering(N->getConstantOperandVal(Num: 1)),
2122 S: Scopes[N->getConstantOperandVal(Num: 2)], T: Subtarget);
2123 SDValue Chain = N->getOperand(Num: 0);
2124 SDNode *FenceNode = CurDAG->getMachineNode(Opcode: FenceOp, dl: DL, VT: MVT::Other, Op1: Chain);
2125 ReplaceNode(F: N, T: FenceNode);
2126 return true;
2127}
2128
2129NVPTXScopes::NVPTXScopes(LLVMContext &C) {
2130 Scopes[C.getOrInsertSyncScopeID(SSN: "singlethread")] = NVPTX::Scope::Thread;
2131 Scopes[C.getOrInsertSyncScopeID(SSN: "")] = NVPTX::Scope::System;
2132 Scopes[C.getOrInsertSyncScopeID(SSN: "block")] = NVPTX::Scope::Block;
2133 Scopes[C.getOrInsertSyncScopeID(SSN: "cluster")] = NVPTX::Scope::Cluster;
2134 Scopes[C.getOrInsertSyncScopeID(SSN: "device")] = NVPTX::Scope::Device;
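  // For example, an IR "fence syncscope("cluster") acquire" resolves its scope
  // operand to NVPTX::Scope::Cluster through this table.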
2135}
2136
2137NVPTX::Scope NVPTXScopes::operator[](SyncScope::ID ID) const {
2138 if (Scopes.empty())
2139 llvm_unreachable("NVPTX Scopes must be initialized before calling "
2140 "NVPTXScopes::operator[]");
2141
2142 auto S = Scopes.find(Key: ID);
2143 if (S == Scopes.end()) {
2144 // TODO:
2145 // - Add API to LLVMContext to get the name of a single scope.
2146 // - Use that API here to print an error containing the name
2147 // of this Unknown ID.
2148 report_fatal_error(reason: formatv(Fmt: "Could not find scope ID={}.", Vals: int(ID)));
2149 }
2150 return S->second;
2151}
2152
2153bool NVPTXScopes::empty() const { return Scopes.empty(); }
2154
2155#define CP_ASYNC_BULK_TENSOR_OPCODE(dir, dim, mode, is_s32, suffix) \
2156 (is_s32 \
2157 ? NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_SHARED32_##mode##suffix \
2158 : NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_##mode##suffix)
2159
2160#define CP_ASYNC_BULK_TENSOR_OPCODE_S2G_IMPL(op, dim, mode, is_ch, is_s32) \
2161 (is_ch ? (CP_ASYNC_BULK_TENSOR_OPCODE(op, dim, mode, is_s32, _CH)) \
2162 : (CP_ASYNC_BULK_TENSOR_OPCODE(op, dim, mode, is_s32, )))
2163
2164#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(dim, mode, is_reduce, is_ch, \
2165 is_s32) \
2166 (is_reduce \
2167 ? (CP_ASYNC_BULK_TENSOR_OPCODE_S2G_IMPL(RED, dim, mode, is_ch, is_s32)) \
2168 : (CP_ASYNC_BULK_TENSOR_OPCODE_S2G_IMPL(S2G, dim, mode, is_ch, \
2169 is_s32)))
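// Illustrative expansion: GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(3D, IM2COL,
// /*is_reduce=*/false, /*is_ch=*/true, /*is_s32=*/false) resolves to
// NVPTX::CP_ASYNC_BULK_TENSOR_S2G_3D_IM2COL_CH.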
2170
2171#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(dim, mode, is_mc, is_ch, is_s32) \
2172 [&]() -> auto { \
2173 if (is_mc && is_ch) \
2174 return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _MC_CH); \
2175 if (is_ch) \
2176 return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _CH); \
2177 if (is_mc) \
2178 return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _MC); \
2179 return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, ); \
2180 }()
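// Illustrative expansion: GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(2D, TILE,
// /*is_mc=*/true, /*is_ch=*/false, /*is_s32=*/true) resolves to
// NVPTX::CP_ASYNC_BULK_TENSOR_G2S_2D_SHARED32_TILE_MC.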
2181
2182#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(dim, mode, is_ch) \
2183 (is_ch ? NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode##_CH \
2184 : NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode)
2185
2186static unsigned GetCpAsyncBulkTensorS2GOpcode(size_t Dim, bool IsShared32,
2187 bool IsCacheHint, bool IsIm2Col,
2188 bool IsReduce = false) {
2189 if (IsIm2Col) {
2190 switch (Dim) {
2191 case 3:
2192 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(3D, IM2COL, IsReduce,
2193 IsCacheHint, IsShared32);
2194 case 4:
2195 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(4D, IM2COL, IsReduce,
2196 IsCacheHint, IsShared32);
2197 case 5:
2198 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(5D, IM2COL, IsReduce,
2199 IsCacheHint, IsShared32);
2200 default:
2201 llvm_unreachable("Invalid Dimension in im2col mode for "
2202 "GetCpAsyncBulkTensorS2GOpcode.");
2203 }
2204 } else {
2205 switch (Dim) {
2206 case 1:
2207 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(1D, TILE, IsReduce,
2208 IsCacheHint, IsShared32);
2209 case 2:
2210 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(2D, TILE, IsReduce,
2211 IsCacheHint, IsShared32);
2212 case 3:
2213 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(3D, TILE, IsReduce,
2214 IsCacheHint, IsShared32);
2215 case 4:
2216 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(4D, TILE, IsReduce,
2217 IsCacheHint, IsShared32);
2218 case 5:
2219 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(5D, TILE, IsReduce,
2220 IsCacheHint, IsShared32);
2221 default:
2222 llvm_unreachable(
2223 "Invalid Dimension in tile mode for GetCpAsyncBulkTensorS2GOpcode.");
2224 }
2225 }
2226}
2227
2228static unsigned GetCpAsyncBulkTensorG2SOpcode(size_t Dim, bool IsShared32,
2229 bool IsMultiCast,
2230 bool IsCacheHint, bool IsIm2Col) {
2231 if (IsIm2Col) {
2232 switch (Dim) {
2233 case 3:
2234 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, IM2COL, IsMultiCast,
2235 IsCacheHint, IsShared32);
2236 case 4:
2237 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(4D, IM2COL, IsMultiCast,
2238 IsCacheHint, IsShared32);
2239 case 5:
2240 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(5D, IM2COL, IsMultiCast,
2241 IsCacheHint, IsShared32);
2242 default:
2243 llvm_unreachable("Invalid Dimension in im2col mode for "
2244 "GetCpAsyncBulkTensorG2SOpcode.");
2245 }
2246 } else {
2247 switch (Dim) {
2248 case 1:
2249 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(1D, TILE, IsMultiCast,
2250 IsCacheHint, IsShared32);
2251 case 2:
2252 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(2D, TILE, IsMultiCast,
2253 IsCacheHint, IsShared32);
2254 case 3:
2255 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, TILE, IsMultiCast,
2256 IsCacheHint, IsShared32);
2257 case 4:
2258 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(4D, TILE, IsMultiCast,
2259 IsCacheHint, IsShared32);
2260 case 5:
2261 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(5D, TILE, IsMultiCast,
2262 IsCacheHint, IsShared32);
2263 default:
2264 llvm_unreachable(
2265 "Invalid Dimension in tile mode for GetCpAsyncBulkTensorG2SOpcode.");
2266 }
2267 }
2268}
2269
2270static unsigned GetCpAsyncBulkTensorPrefetchOpcode(size_t Dim, bool IsCacheHint,
2271 bool IsIm2Col) {
2272 if (IsIm2Col) {
2273 switch (Dim) {
2274 case 3:
2275 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(3D, IM2COL, IsCacheHint);
2276 case 4:
2277 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(4D, IM2COL, IsCacheHint);
2278 case 5:
2279 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(5D, IM2COL, IsCacheHint);
2280 default:
2281 llvm_unreachable("Invalid Dimension in im2col mode for "
2282 "GetCpAsyncBulkTensorPrefetchOpcode.");
2283 }
2284 } else {
2285 switch (Dim) {
2286 case 1:
2287 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(1D, TILE, IsCacheHint);
2288 case 2:
2289 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(2D, TILE, IsCacheHint);
2290 case 3:
2291 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(3D, TILE, IsCacheHint);
2292 case 4:
2293 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(4D, TILE, IsCacheHint);
2294 case 5:
2295 return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(5D, TILE, IsCacheHint);
2296 default:
2297 llvm_unreachable("Invalid Dimension in tile mode for "
2298 "GetCpAsyncBulkTensorPrefetchOpcode.");
2299 }
2300 }
2301}
2302
2303static size_t GetDimsFromIntrinsic(unsigned IID) {
2304 switch (IID) {
2305 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
2306 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_3d:
2307 return 3;
2308 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
2309 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_4d:
2310 return 4;
2311 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
2312 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_5d:
2313 return 5;
2314 default:
2315 llvm_unreachable("Invalid im2col intrinsic in GetDimsFromIntrinsic.");
2316 }
2317}
2318
2319void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon(SDNode *N,
2320 bool IsIm2Col) {
2321  // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
2322  // {dst, mbar, src, dims{d0...dN}, im2col_offsets{dims-2},
2323 // multicast, cache_hint,
2324 // multicast_flag, cache_hint_flag, cta_group_flag}
2325 // NumOperands = {Chain, IID} + {Actual intrinsic args}
2326 // = {2} + {8 + dims + im2col_offsets}
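  // For example, the 2D tile variant has NumOps = 2 + (3 + 2 dims + 5 flag/hint
  // operands) = 12, so NumDims = 12 - 10 = 2.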
2327 size_t NumOps = N->getNumOperands();
2328 size_t NumDims = IsIm2Col ? GetDimsFromIntrinsic(IID: N->getConstantOperandVal(Num: 1))
2329 : (NumOps - 10);
2330  // The offset count is always 'NumDims - 2'; offsets exist only in im2col mode
2331 size_t NumOffsets = IsIm2Col ? (NumDims - 2) : 0;
2332 bool IsCacheHint = N->getConstantOperandVal(Num: NumOps - 2) == 1;
2333 bool IsMultiCast = N->getConstantOperandVal(Num: NumOps - 3) == 1;
2334 size_t NumBaseArgs = NumDims + NumOffsets + 3; // for {dst, mbar, src}
2335 size_t MultiCastIdx = NumBaseArgs + 2; // for Chain and IID
2336
2337 unsigned CTAGroupVal = N->getConstantOperandVal(Num: NumOps - 1);
2338 if ((CTAGroupVal > 0) && !Subtarget->hasCpAsyncBulkTensorCTAGroupSupport())
2339 report_fatal_error(
2340 reason: formatv(Fmt: "CpAsyncBulkTensorG2S cta_group::1/2 is not supported on sm_{}",
2341 Vals: Subtarget->getSmVersion()));
2342
2343 SDLoc DL(N);
2344 SmallVector<SDValue, 8> Ops(N->ops().slice(N: 2, M: NumBaseArgs));
2345
2346 // Push MultiCast operand, if available
2347 if (IsMultiCast)
2348 Ops.push_back(Elt: N->getOperand(Num: MultiCastIdx));
2349
2350 // Push CacheHint operand, if available
2351 if (IsCacheHint)
2352 Ops.push_back(Elt: N->getOperand(Num: MultiCastIdx + 1));
2353
2354 // Flag for CTA Group
2355 Ops.push_back(Elt: getI32Imm(Imm: CTAGroupVal, DL));
2356
2357 // Finally, the chain operand
2358 Ops.push_back(Elt: N->getOperand(Num: 0));
2359
2360 bool IsShared32 =
2361 CurDAG->getDataLayout().getPointerSizeInBits(AS: ADDRESS_SPACE_SHARED) == 32;
2362 unsigned Opcode = GetCpAsyncBulkTensorG2SOpcode(
2363 Dim: NumDims, IsShared32, IsMultiCast, IsCacheHint, IsIm2Col);
2364 ReplaceNode(F: N, T: CurDAG->getMachineNode(Opcode, dl: DL, VTs: N->getVTList(), Ops));
2365}
2366
2367void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorS2GCommon(SDNode *N,
2368 bool IsIm2Col) {
2369  // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
2370 // src, dst, dims{d0...dN}, cache_hint, cache_hint_flag
2371 // NumOperands = {Chain, IID} + {Actual intrinsic args}
2372 // = {2} + {4 + dims}
2373 size_t NumOps = N->getNumOperands();
2374 size_t NumDims = NumOps - 6;
2375 bool IsCacheHint = N->getConstantOperandVal(Num: NumOps - 1) == 1;
2376 size_t NumArgs = NumDims + (IsCacheHint ? 3 : 2); // src, dst, cache_hint
2377
2378 SDLoc DL(N);
2379 SmallVector<SDValue, 8> Ops(N->ops().slice(N: 2, M: NumArgs));
2380 Ops.push_back(Elt: N->getOperand(Num: 0)); // Chain operand
2381
2382 bool IsShared32 =
2383 CurDAG->getDataLayout().getPointerSizeInBits(AS: ADDRESS_SPACE_SHARED) == 32;
2384 unsigned Opcode =
2385 GetCpAsyncBulkTensorS2GOpcode(Dim: NumDims, IsShared32, IsCacheHint, IsIm2Col);
2386 ReplaceNode(F: N, T: CurDAG->getMachineNode(Opcode, dl: DL, VTs: N->getVTList(), Ops));
2387}
2388
2389void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorPrefetchCommon(SDNode *N,
2390 bool IsIm2Col) {
2391  // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
2392  // {src, dims{d0...dN}, im2col_offsets{dims-2},
2393 // cache_hint, cache_hint_flag}
2394 // NumOperands = {Chain, IID} + {Actual intrinsic args}
2395 // = {2} + {3 + dims + im2col_offsets}
2396 size_t NumOps = N->getNumOperands();
2397 size_t NumDims = IsIm2Col ? GetDimsFromIntrinsic(IID: N->getConstantOperandVal(Num: 1))
2398 : (NumOps - 5);
2399  // The offset count is always 'NumDims - 2'; offsets exist only in im2col mode
2400 size_t NumOffsets = IsIm2Col ? (NumDims - 2) : 0;
2401 bool IsCacheHint = N->getConstantOperandVal(Num: NumOps - 1) == 1;
2402 size_t NumArgs = NumDims + NumOffsets + (IsCacheHint ? 2 : 1);
2403
2404 SDLoc DL(N);
2405 SmallVector<SDValue, 12> Ops(N->ops().slice(N: 2, M: NumArgs));
2406 Ops.push_back(Elt: N->getOperand(Num: 0)); // Chain operand
2407
2408 unsigned Opcode =
2409 GetCpAsyncBulkTensorPrefetchOpcode(Dim: NumDims, IsCacheHint, IsIm2Col);
2410 ReplaceNode(F: N, T: CurDAG->getMachineNode(Opcode, dl: DL, VTs: N->getVTList(), Ops));
2411}
2412
2413void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N,
2414 unsigned RedOp,
2415 bool IsIm2Col) {
2416  // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
2417 // src, dst, dims{d0...dN}, cache_hint, cache_hint_flag
2418 // NumOperands = {Chain, IID} + {Actual intrinsic args}
2419 // = {2} + {4 + dims}
2420 size_t NumOps = N->getNumOperands();
2421 size_t NumDims = NumOps - 6;
2422 bool IsCacheHint = N->getConstantOperandVal(Num: NumOps - 1) == 1;
2423 size_t NumArgs = NumDims + (IsCacheHint ? 3 : 2); // src, dst, cache_hint
2424
2425 SDLoc DL(N);
2426 SmallVector<SDValue, 12> Ops(N->ops().slice(N: 2, M: NumArgs));
2427 Ops.push_back(Elt: getI32Imm(Imm: RedOp, DL)); // Reduction Op
2428 Ops.push_back(Elt: N->getOperand(Num: 0)); // Chain operand
2429
2430 bool IsShared32 =
2431 CurDAG->getDataLayout().getPointerSizeInBits(AS: ADDRESS_SPACE_SHARED) == 32;
2432 unsigned Opcode = GetCpAsyncBulkTensorS2GOpcode(
2433 Dim: NumDims, IsShared32, IsCacheHint, IsIm2Col, /*IsReduce=*/true);
2434 ReplaceNode(F: N, T: CurDAG->getMachineNode(Opcode, dl: DL, VTs: N->getVTList(), Ops));
2435}
2436
2437#define TCGEN05_ST_OPCODE(SHAPE, NUM) \
2438 (enableUnpack ? NVPTX::TCGEN05_ST_##SHAPE##_##NUM##_UNPACK \
2439 : NVPTX::TCGEN05_ST_##SHAPE##_##NUM)
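// Illustrative expansion: getTcgen05StOpcode(Intrinsic::nvvm_tcgen05_st_16x64b_x2,
// /*enableUnpack=*/true) returns NVPTX::TCGEN05_ST_16x64b_x2_UNPACK.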
2440
2441static unsigned getTcgen05StOpcode(unsigned IID, bool enableUnpack) {
2442 switch (IID) {
2443 case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
2444 return TCGEN05_ST_OPCODE(16x64b, x1);
2445 case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
2446 return TCGEN05_ST_OPCODE(16x64b, x2);
2447 case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
2448 return TCGEN05_ST_OPCODE(16x64b, x4);
2449 case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
2450 return TCGEN05_ST_OPCODE(16x64b, x8);
2451 case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
2452 return TCGEN05_ST_OPCODE(16x64b, x16);
2453 case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
2454 return TCGEN05_ST_OPCODE(16x64b, x32);
2455 case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
2456 return TCGEN05_ST_OPCODE(16x64b, x64);
2457 case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
2458 return TCGEN05_ST_OPCODE(16x64b, x128);
2459 case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
2460 return TCGEN05_ST_OPCODE(16x128b, x1);
2461 case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
2462 return TCGEN05_ST_OPCODE(16x128b, x2);
2463 case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
2464 return TCGEN05_ST_OPCODE(16x128b, x4);
2465 case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
2466 return TCGEN05_ST_OPCODE(16x128b, x8);
2467 case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
2468 return TCGEN05_ST_OPCODE(16x128b, x16);
2469 case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
2470 return TCGEN05_ST_OPCODE(16x128b, x32);
2471 case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
2472 return TCGEN05_ST_OPCODE(16x128b, x64);
2473 case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
2474 return TCGEN05_ST_OPCODE(16x256b, x1);
2475 case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
2476 return TCGEN05_ST_OPCODE(16x256b, x2);
2477 case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
2478 return TCGEN05_ST_OPCODE(16x256b, x4);
2479 case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
2480 return TCGEN05_ST_OPCODE(16x256b, x8);
2481 case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
2482 return TCGEN05_ST_OPCODE(16x256b, x16);
2483 case Intrinsic::nvvm_tcgen05_st_16x256b_x32:
2484 return TCGEN05_ST_OPCODE(16x256b, x32);
2485 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1:
2486 return TCGEN05_ST_OPCODE(16x32bx2, x1);
2487 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2:
2488 return TCGEN05_ST_OPCODE(16x32bx2, x2);
2489 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4:
2490 return TCGEN05_ST_OPCODE(16x32bx2, x4);
2491 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8:
2492 return TCGEN05_ST_OPCODE(16x32bx2, x8);
2493 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16:
2494 return TCGEN05_ST_OPCODE(16x32bx2, x16);
2495 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32:
2496 return TCGEN05_ST_OPCODE(16x32bx2, x32);
2497 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64:
2498 return TCGEN05_ST_OPCODE(16x32bx2, x64);
2499 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128:
2500 return TCGEN05_ST_OPCODE(16x32bx2, x128);
2501 case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
2502 return TCGEN05_ST_OPCODE(32x32b, x1);
2503 case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
2504 return TCGEN05_ST_OPCODE(32x32b, x2);
2505 case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
2506 return TCGEN05_ST_OPCODE(32x32b, x4);
2507 case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
2508 return TCGEN05_ST_OPCODE(32x32b, x8);
2509 case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
2510 return TCGEN05_ST_OPCODE(32x32b, x16);
2511 case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
2512 return TCGEN05_ST_OPCODE(32x32b, x32);
2513 case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
2514 return TCGEN05_ST_OPCODE(32x32b, x64);
2515 case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
2516 return TCGEN05_ST_OPCODE(32x32b, x128);
2517 }
2518 llvm_unreachable("unhandled tcgen05.st lowering");
2519}
2520
2521void NVPTXDAGToDAGISel::SelectTcgen05St(SDNode *N, bool hasOffset) {
2522 SDLoc DL(N);
2523 unsigned IID = cast<ConstantSDNode>(Val: N->getOperand(Num: 1))->getZExtValue();
2524
2525 SmallVector<SDValue, 128> Operands = {
2526 N->getOperand(Num: 2) // taddr
2527 };
2528
2529 if (hasOffset)
2530 Operands.push_back(Elt: CurDAG->getTargetConstant(
2531 Val: cast<ConstantSDNode>(Val: N->getOperand(Num: 3))->getZExtValue(), DL,
2532 VT: MVT::i32)); // Offset
2533
2534 for (unsigned I = hasOffset ? 4 : 3; I < (N->getNumOperands() - 1); I++)
2535 Operands.push_back(Elt: N->getOperand(Num: I));
2536
2537 bool enableUnpack =
2538 cast<ConstantSDNode>(Val: N->getOperand(Num: N->getNumOperands() - 1))
2539 ->getZExtValue();
2540
2541 Operands.push_back(Elt: N->getOperand(Num: 0)); // Chain
2542 ReplaceNode(F: N, T: CurDAG->getMachineNode(Opcode: getTcgen05StOpcode(IID, enableUnpack),
2543 dl: DL, VTs: N->getVTList(), Ops: Operands));
2544}
2545
2546bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
2547 unsigned IID = N->getConstantOperandVal(Num: 1);
2548 using TMARedTy = llvm::nvvm::TMAReductionOp;
2549 auto CastTy = [](TMARedTy Op) { return static_cast<unsigned>(Op); };
2550 switch (IID) {
2551 default:
2552 return false;
2553 case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_1d:
2554 case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_2d:
2555 case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_3d:
2556 case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_4d:
2557 case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_5d:
2558 SelectCpAsyncBulkTensorS2GCommon(N);
2559 return true;
2560 case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_im2col_3d:
2561 case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_im2col_4d:
2562 case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_im2col_5d:
2563 SelectCpAsyncBulkTensorS2GCommon(N, /*IsIm2Col=*/true);
2564 return true;
2565 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d:
2566 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d:
2567 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d:
2568 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d:
2569 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d:
2570 SelectCpAsyncBulkTensorG2SCommon(N);
2571 return true;
2572 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
2573 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
2574 case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
2575 SelectCpAsyncBulkTensorG2SCommon(N, /*IsIm2Col=*/true);
2576 return true;
2577 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_1d:
2578 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_2d:
2579 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_3d:
2580 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_4d:
2581 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_5d:
2582 SelectCpAsyncBulkTensorPrefetchCommon(N);
2583 return true;
2584 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_3d:
2585 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_4d:
2586 case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_5d:
2587 SelectCpAsyncBulkTensorPrefetchCommon(N, /*IsIm2Col=*/true);
2588 return true;
2589 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_1d:
2590 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_2d:
2591 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_3d:
2592 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_4d:
2593 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_5d:
2594 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::ADD));
2595 return true;
2596 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_3d:
2597 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_4d:
2598 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_5d:
2599 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::ADD),
2600 /*IsIm2Col=*/true);
2601 return true;
2602 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_1d:
2603 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_2d:
2604 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_3d:
2605 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_4d:
2606 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_5d:
2607 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::MIN));
2608 return true;
2609 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_3d:
2610 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_4d:
2611 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_5d:
2612 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::MIN),
2613 /*IsIm2Col=*/true);
2614 return true;
2615 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_1d:
2616 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_2d:
2617 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_3d:
2618 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_4d:
2619 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_5d:
2620 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::MAX));
2621 return true;
2622 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_3d:
2623 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_4d:
2624 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_5d:
2625 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::MAX),
2626 /*IsIm2Col=*/true);
2627 return true;
2628 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_1d:
2629 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_2d:
2630 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_3d:
2631 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_4d:
2632 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_5d:
2633 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::INC));
2634 return true;
2635 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_3d:
2636 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_4d:
2637 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_5d:
2638 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::INC),
2639 /*IsIm2Col=*/true);
2640 return true;
2641 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_1d:
2642 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_2d:
2643 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_3d:
2644 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_4d:
2645 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_5d:
2646 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::DEC));
2647 return true;
2648 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_3d:
2649 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_4d:
2650 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_5d:
2651 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::DEC),
2652 /*IsIm2Col=*/true);
2653 return true;
2654 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_1d:
2655 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_2d:
2656 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_3d:
2657 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_4d:
2658 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_5d:
2659 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::AND));
2660 return true;
2661 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_3d:
2662 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_4d:
2663 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_5d:
2664 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::AND),
2665 /*IsIm2Col=*/true);
2666 return true;
2667 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_1d:
2668 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_2d:
2669 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_3d:
2670 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_4d:
2671 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_5d:
2672 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::OR));
2673 return true;
2674 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_3d:
2675 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_4d:
2676 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_5d:
2677 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::OR),
2678 /*IsIm2Col=*/true);
2679 return true;
2680 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_1d:
2681 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_2d:
2682 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_3d:
2683 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_4d:
2684 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_5d:
2685 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::XOR));
2686 return true;
2687 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_3d:
2688 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_4d:
2689 case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_5d:
2690 SelectCpAsyncBulkTensorReduceCommon(N, RedOp: CastTy(TMARedTy::XOR),
2691 /*IsIm2Col=*/true);
2692 return true;
2693
2694 case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
2695 case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
2696 case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
2697 case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
2698 case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
2699 case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
2700 case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
2701 case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
2702 case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
2703 case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
2704 case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
2705 case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
2706 case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
2707 case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
2708 case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
2709 case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
2710 case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
2711 case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
2712 case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
2713 case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
2714 case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
2715 case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
2716 case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
2717 case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
2718 case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
2719 case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
2720 case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
2721 case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
2722 case Intrinsic::nvvm_tcgen05_st_16x256b_x32: {
2723 SelectTcgen05St(N);
2724 return true;
2725 }
2726
2727 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1:
2728 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2:
2729 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4:
2730 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8:
2731 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16:
2732 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32:
2733 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64:
2734 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128: {
2735 SelectTcgen05St(N, /* hasOffset */ true);
2736 return true;
2737 }
2738 }
2739}
2740