//===-------- NVPTX.cpp - Emit LLVM Code for builtins ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This contains code to emit Builtin calls as LLVM code.
//
//===----------------------------------------------------------------------===//

#include "CGBuiltin.h"
#include "clang/Basic/TargetBuiltins.h"
#include "llvm/IR/IntrinsicsNVPTX.h"

using namespace clang;
using namespace CodeGen;
using namespace llvm;

namespace {
// Helper classes for mapping MMA builtins to a particular LLVM intrinsic
// variant.
struct NVPTXMmaLdstInfo {
  unsigned NumResults; // Number of elements to load/store
  // Intrinsic IDs for row/col variants. 0 if particular layout is unsupported.
  unsigned IID_col;
  unsigned IID_row;
};

#define MMA_INTR(geom_op_type, layout)                                         \
  Intrinsic::nvvm_wmma_##geom_op_type##_##layout##_stride
#define MMA_LDST(n, geom_op_type)                                              \
  { n, MMA_INTR(geom_op_type, col), MMA_INTR(geom_op_type, row) }

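// For example, MMA_LDST(8, m16n16k16_load_a_f16) used below expands to
//   {8, Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride,
//       Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride}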
static NVPTXMmaLdstInfo getNVPTXMmaLdstInfo(unsigned BuiltinID) {
  switch (BuiltinID) {
  // FP MMA loads
  case NVPTX::BI__hmma_m16n16k16_ld_a:
    return MMA_LDST(8, m16n16k16_load_a_f16);
  case NVPTX::BI__hmma_m16n16k16_ld_b:
    return MMA_LDST(8, m16n16k16_load_b_f16);
  case NVPTX::BI__hmma_m16n16k16_ld_c_f16:
    return MMA_LDST(4, m16n16k16_load_c_f16);
  case NVPTX::BI__hmma_m16n16k16_ld_c_f32:
    return MMA_LDST(8, m16n16k16_load_c_f32);
  case NVPTX::BI__hmma_m32n8k16_ld_a:
    return MMA_LDST(8, m32n8k16_load_a_f16);
  case NVPTX::BI__hmma_m32n8k16_ld_b:
    return MMA_LDST(8, m32n8k16_load_b_f16);
  case NVPTX::BI__hmma_m32n8k16_ld_c_f16:
    return MMA_LDST(4, m32n8k16_load_c_f16);
  case NVPTX::BI__hmma_m32n8k16_ld_c_f32:
    return MMA_LDST(8, m32n8k16_load_c_f32);
  case NVPTX::BI__hmma_m8n32k16_ld_a:
    return MMA_LDST(8, m8n32k16_load_a_f16);
  case NVPTX::BI__hmma_m8n32k16_ld_b:
    return MMA_LDST(8, m8n32k16_load_b_f16);
  case NVPTX::BI__hmma_m8n32k16_ld_c_f16:
    return MMA_LDST(4, m8n32k16_load_c_f16);
  case NVPTX::BI__hmma_m8n32k16_ld_c_f32:
    return MMA_LDST(8, m8n32k16_load_c_f32);

  // Integer MMA loads
  case NVPTX::BI__imma_m16n16k16_ld_a_s8:
    return MMA_LDST(2, m16n16k16_load_a_s8);
  case NVPTX::BI__imma_m16n16k16_ld_a_u8:
    return MMA_LDST(2, m16n16k16_load_a_u8);
  case NVPTX::BI__imma_m16n16k16_ld_b_s8:
    return MMA_LDST(2, m16n16k16_load_b_s8);
  case NVPTX::BI__imma_m16n16k16_ld_b_u8:
    return MMA_LDST(2, m16n16k16_load_b_u8);
  case NVPTX::BI__imma_m16n16k16_ld_c:
    return MMA_LDST(8, m16n16k16_load_c_s32);
  case NVPTX::BI__imma_m32n8k16_ld_a_s8:
    return MMA_LDST(4, m32n8k16_load_a_s8);
  case NVPTX::BI__imma_m32n8k16_ld_a_u8:
    return MMA_LDST(4, m32n8k16_load_a_u8);
  case NVPTX::BI__imma_m32n8k16_ld_b_s8:
    return MMA_LDST(1, m32n8k16_load_b_s8);
  case NVPTX::BI__imma_m32n8k16_ld_b_u8:
    return MMA_LDST(1, m32n8k16_load_b_u8);
  case NVPTX::BI__imma_m32n8k16_ld_c:
    return MMA_LDST(8, m32n8k16_load_c_s32);
  case NVPTX::BI__imma_m8n32k16_ld_a_s8:
    return MMA_LDST(1, m8n32k16_load_a_s8);
  case NVPTX::BI__imma_m8n32k16_ld_a_u8:
    return MMA_LDST(1, m8n32k16_load_a_u8);
  case NVPTX::BI__imma_m8n32k16_ld_b_s8:
    return MMA_LDST(4, m8n32k16_load_b_s8);
  case NVPTX::BI__imma_m8n32k16_ld_b_u8:
    return MMA_LDST(4, m8n32k16_load_b_u8);
  case NVPTX::BI__imma_m8n32k16_ld_c:
    return MMA_LDST(8, m8n32k16_load_c_s32);

  // Sub-integer MMA loads.
  // Only row/col layout is supported by A/B fragments.
  case NVPTX::BI__imma_m8n8k32_ld_a_s4:
    return {1, 0, MMA_INTR(m8n8k32_load_a_s4, row)};
  case NVPTX::BI__imma_m8n8k32_ld_a_u4:
    return {1, 0, MMA_INTR(m8n8k32_load_a_u4, row)};
  case NVPTX::BI__imma_m8n8k32_ld_b_s4:
    return {1, MMA_INTR(m8n8k32_load_b_s4, col), 0};
  case NVPTX::BI__imma_m8n8k32_ld_b_u4:
    return {1, MMA_INTR(m8n8k32_load_b_u4, col), 0};
  case NVPTX::BI__imma_m8n8k32_ld_c:
    return MMA_LDST(2, m8n8k32_load_c_s32);
  case NVPTX::BI__bmma_m8n8k128_ld_a_b1:
    return {1, 0, MMA_INTR(m8n8k128_load_a_b1, row)};
  case NVPTX::BI__bmma_m8n8k128_ld_b_b1:
    return {1, MMA_INTR(m8n8k128_load_b_b1, col), 0};
  case NVPTX::BI__bmma_m8n8k128_ld_c:
    return MMA_LDST(2, m8n8k128_load_c_s32);

  // Double MMA loads
  case NVPTX::BI__dmma_m8n8k4_ld_a:
    return MMA_LDST(1, m8n8k4_load_a_f64);
  case NVPTX::BI__dmma_m8n8k4_ld_b:
    return MMA_LDST(1, m8n8k4_load_b_f64);
  case NVPTX::BI__dmma_m8n8k4_ld_c:
    return MMA_LDST(2, m8n8k4_load_c_f64);

  // Alternate float MMA loads
  case NVPTX::BI__mma_bf16_m16n16k16_ld_a:
    return MMA_LDST(4, m16n16k16_load_a_bf16);
  case NVPTX::BI__mma_bf16_m16n16k16_ld_b:
    return MMA_LDST(4, m16n16k16_load_b_bf16);
  case NVPTX::BI__mma_bf16_m8n32k16_ld_a:
    return MMA_LDST(2, m8n32k16_load_a_bf16);
  case NVPTX::BI__mma_bf16_m8n32k16_ld_b:
    return MMA_LDST(8, m8n32k16_load_b_bf16);
  case NVPTX::BI__mma_bf16_m32n8k16_ld_a:
    return MMA_LDST(8, m32n8k16_load_a_bf16);
  case NVPTX::BI__mma_bf16_m32n8k16_ld_b:
    return MMA_LDST(2, m32n8k16_load_b_bf16);
  case NVPTX::BI__mma_tf32_m16n16k8_ld_a:
    return MMA_LDST(4, m16n16k8_load_a_tf32);
  case NVPTX::BI__mma_tf32_m16n16k8_ld_b:
    return MMA_LDST(4, m16n16k8_load_b_tf32);
  case NVPTX::BI__mma_tf32_m16n16k8_ld_c:
    return MMA_LDST(8, m16n16k8_load_c_f32);

  // NOTE: We need to follow the inconsistent naming scheme used by NVCC.
  // Unlike PTX and LLVM IR, where stores always use fragment D, NVCC builtins
  // always use fragment C for both loads and stores.
  // FP MMA stores.
  case NVPTX::BI__hmma_m16n16k16_st_c_f16:
    return MMA_LDST(4, m16n16k16_store_d_f16);
  case NVPTX::BI__hmma_m16n16k16_st_c_f32:
    return MMA_LDST(8, m16n16k16_store_d_f32);
  case NVPTX::BI__hmma_m32n8k16_st_c_f16:
    return MMA_LDST(4, m32n8k16_store_d_f16);
  case NVPTX::BI__hmma_m32n8k16_st_c_f32:
    return MMA_LDST(8, m32n8k16_store_d_f32);
  case NVPTX::BI__hmma_m8n32k16_st_c_f16:
    return MMA_LDST(4, m8n32k16_store_d_f16);
  case NVPTX::BI__hmma_m8n32k16_st_c_f32:
    return MMA_LDST(8, m8n32k16_store_d_f32);

  // Integer and sub-integer MMA stores.
  // Another naming quirk. Unlike other MMA builtins that use PTX types in the
  // name, integer loads/stores use LLVM's i32.
  case NVPTX::BI__imma_m16n16k16_st_c_i32:
    return MMA_LDST(8, m16n16k16_store_d_s32);
  case NVPTX::BI__imma_m32n8k16_st_c_i32:
    return MMA_LDST(8, m32n8k16_store_d_s32);
  case NVPTX::BI__imma_m8n32k16_st_c_i32:
    return MMA_LDST(8, m8n32k16_store_d_s32);
  case NVPTX::BI__imma_m8n8k32_st_c_i32:
    return MMA_LDST(2, m8n8k32_store_d_s32);
  case NVPTX::BI__bmma_m8n8k128_st_c_i32:
    return MMA_LDST(2, m8n8k128_store_d_s32);

  // Double MMA store
  case NVPTX::BI__dmma_m8n8k4_st_c_f64:
    return MMA_LDST(2, m8n8k4_store_d_f64);

  // Alternate float MMA store
  case NVPTX::BI__mma_m16n16k8_st_c_f32:
    return MMA_LDST(8, m16n16k8_store_d_f32);

  default:
    llvm_unreachable("Unknown MMA builtin");
  }
}
#undef MMA_LDST
#undef MMA_INTR

struct NVPTXMmaInfo {
  unsigned NumEltsA;
  unsigned NumEltsB;
  unsigned NumEltsC;
  unsigned NumEltsD;

  // Variants are ordered by layout-A/layout-B/satf, where 'row' has priority
  // over 'col' for layout. The index of non-satf variants is expected to match
  // the undocumented layout constants used by CUDA's mma.hpp.
  std::array<unsigned, 8> Variants;

  // Returns an intrinsic that matches Layout and Satf for valid combinations
  // of Layout and Satf, 0 otherwise.
  unsigned getMMAIntrinsic(int Layout, bool Satf) {
    unsigned Index = Layout + 4 * Satf;
    if (Index >= Variants.size())
      return 0;
    return Variants[Index];
  }
};

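// Worked example of the Variants indexing: __hmma_m16n16k16_mma_f32f16 with
// Layout == 1 (row-major A, col-major B) and Satf == true selects
// Variants[1 + 4 * 1], i.e.
// Intrinsic::nvvm_wmma_m16n16k16_mma_row_col_f32_f16_satfinite.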
static NVPTXMmaInfo getNVPTXMmaInfo(unsigned BuiltinID) {
  // clang-format off
#define MMA_VARIANTS(geom, type)                                    \
      Intrinsic::nvvm_wmma_##geom##_mma_row_row_##type,             \
      Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type,             \
      Intrinsic::nvvm_wmma_##geom##_mma_col_row_##type,             \
      Intrinsic::nvvm_wmma_##geom##_mma_col_col_##type
#define MMA_SATF_VARIANTS(geom, type)                               \
      MMA_VARIANTS(geom, type),                                     \
      Intrinsic::nvvm_wmma_##geom##_mma_row_row_##type##_satfinite, \
      Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type##_satfinite, \
      Intrinsic::nvvm_wmma_##geom##_mma_col_row_##type##_satfinite, \
      Intrinsic::nvvm_wmma_##geom##_mma_col_col_##type##_satfinite
// Sub-integer MMA only supports row.col layout.
#define MMA_VARIANTS_I4(geom, type) \
      0, \
      Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type, \
      0, \
      0, \
      0, \
      Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type##_satfinite, \
      0, \
      0
// b1 MMA does not support .satfinite.
#define MMA_VARIANTS_B1_XOR(geom, type) \
      0, \
      Intrinsic::nvvm_wmma_##geom##_mma_xor_popc_row_col_##type, \
      0, \
      0, \
      0, \
      0, \
      0, \
      0
#define MMA_VARIANTS_B1_AND(geom, type) \
      0, \
      Intrinsic::nvvm_wmma_##geom##_mma_and_popc_row_col_##type, \
      0, \
      0, \
      0, \
      0, \
      0, \
      0
  // clang-format on
  switch (BuiltinID) {
  // FP MMA
  // Note that 'type' argument of MMA_SATF_VARIANTS uses D_C notation, while
  // NumEltsN of return value are ordered as A,B,C,D.
  case NVPTX::BI__hmma_m16n16k16_mma_f16f16:
    return {8, 8, 4, 4, {{MMA_SATF_VARIANTS(m16n16k16, f16_f16)}}};
  case NVPTX::BI__hmma_m16n16k16_mma_f32f16:
    return {8, 8, 4, 8, {{MMA_SATF_VARIANTS(m16n16k16, f32_f16)}}};
  case NVPTX::BI__hmma_m16n16k16_mma_f16f32:
    return {8, 8, 8, 4, {{MMA_SATF_VARIANTS(m16n16k16, f16_f32)}}};
  case NVPTX::BI__hmma_m16n16k16_mma_f32f32:
    return {8, 8, 8, 8, {{MMA_SATF_VARIANTS(m16n16k16, f32_f32)}}};
  case NVPTX::BI__hmma_m32n8k16_mma_f16f16:
    return {8, 8, 4, 4, {{MMA_SATF_VARIANTS(m32n8k16, f16_f16)}}};
  case NVPTX::BI__hmma_m32n8k16_mma_f32f16:
    return {8, 8, 4, 8, {{MMA_SATF_VARIANTS(m32n8k16, f32_f16)}}};
  case NVPTX::BI__hmma_m32n8k16_mma_f16f32:
    return {8, 8, 8, 4, {{MMA_SATF_VARIANTS(m32n8k16, f16_f32)}}};
  case NVPTX::BI__hmma_m32n8k16_mma_f32f32:
    return {8, 8, 8, 8, {{MMA_SATF_VARIANTS(m32n8k16, f32_f32)}}};
  case NVPTX::BI__hmma_m8n32k16_mma_f16f16:
    return {8, 8, 4, 4, {{MMA_SATF_VARIANTS(m8n32k16, f16_f16)}}};
  case NVPTX::BI__hmma_m8n32k16_mma_f32f16:
    return {8, 8, 4, 8, {{MMA_SATF_VARIANTS(m8n32k16, f32_f16)}}};
  case NVPTX::BI__hmma_m8n32k16_mma_f16f32:
    return {8, 8, 8, 4, {{MMA_SATF_VARIANTS(m8n32k16, f16_f32)}}};
  case NVPTX::BI__hmma_m8n32k16_mma_f32f32:
    return {8, 8, 8, 8, {{MMA_SATF_VARIANTS(m8n32k16, f32_f32)}}};

  // Integer MMA
  case NVPTX::BI__imma_m16n16k16_mma_s8:
    return {2, 2, 8, 8, {{MMA_SATF_VARIANTS(m16n16k16, s8)}}};
  case NVPTX::BI__imma_m16n16k16_mma_u8:
    return {2, 2, 8, 8, {{MMA_SATF_VARIANTS(m16n16k16, u8)}}};
  case NVPTX::BI__imma_m32n8k16_mma_s8:
    return {4, 1, 8, 8, {{MMA_SATF_VARIANTS(m32n8k16, s8)}}};
  case NVPTX::BI__imma_m32n8k16_mma_u8:
    return {4, 1, 8, 8, {{MMA_SATF_VARIANTS(m32n8k16, u8)}}};
  case NVPTX::BI__imma_m8n32k16_mma_s8:
    return {1, 4, 8, 8, {{MMA_SATF_VARIANTS(m8n32k16, s8)}}};
  case NVPTX::BI__imma_m8n32k16_mma_u8:
    return {1, 4, 8, 8, {{MMA_SATF_VARIANTS(m8n32k16, u8)}}};

  // Sub-integer MMA
  case NVPTX::BI__imma_m8n8k32_mma_s4:
    return {1, 1, 2, 2, {{MMA_VARIANTS_I4(m8n8k32, s4)}}};
  case NVPTX::BI__imma_m8n8k32_mma_u4:
    return {1, 1, 2, 2, {{MMA_VARIANTS_I4(m8n8k32, u4)}}};
  case NVPTX::BI__bmma_m8n8k128_mma_xor_popc_b1:
    return {1, 1, 2, 2, {{MMA_VARIANTS_B1_XOR(m8n8k128, b1)}}};
  case NVPTX::BI__bmma_m8n8k128_mma_and_popc_b1:
    return {1, 1, 2, 2, {{MMA_VARIANTS_B1_AND(m8n8k128, b1)}}};

  // Double MMA
  case NVPTX::BI__dmma_m8n8k4_mma_f64:
    return {1, 1, 2, 2, {{MMA_VARIANTS(m8n8k4, f64)}}};

  // Alternate FP MMA
  case NVPTX::BI__mma_bf16_m16n16k16_mma_f32:
    return {4, 4, 8, 8, {{MMA_VARIANTS(m16n16k16, bf16)}}};
  case NVPTX::BI__mma_bf16_m8n32k16_mma_f32:
    return {2, 8, 8, 8, {{MMA_VARIANTS(m8n32k16, bf16)}}};
  case NVPTX::BI__mma_bf16_m32n8k16_mma_f32:
    return {8, 2, 8, 8, {{MMA_VARIANTS(m32n8k16, bf16)}}};
  case NVPTX::BI__mma_tf32_m16n16k8_mma_f32:
    return {4, 4, 8, 8, {{MMA_VARIANTS(m16n16k8, tf32)}}};
  default:
    llvm_unreachable("Unexpected builtin ID.");
  }
#undef MMA_VARIANTS
#undef MMA_SATF_VARIANTS
#undef MMA_VARIANTS_I4
#undef MMA_VARIANTS_B1_AND
#undef MMA_VARIANTS_B1_XOR
}

static Value *MakeLdu(unsigned IntrinsicID, CodeGenFunction &CGF,
                      const CallExpr *E) {
  Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
  QualType ArgType = E->getArg(0)->getType();
  clang::CharUnits Align = CGF.CGM.getNaturalPointeeTypeAlignment(ArgType);
  llvm::Type *ElemTy = CGF.ConvertTypeForMem(ArgType->getPointeeType());
  return CGF.Builder.CreateCall(
      CGF.CGM.getIntrinsic(IntrinsicID, {ElemTy, Ptr->getType()}),
      {Ptr, ConstantInt::get(CGF.Builder.getInt32Ty(), Align.getQuantity())});
}

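// Emits a global-space invariant load. For a 'float *' argument this produces
// roughly (a sketch; the loaded type and alignment depend on the builtin):
//   %global = addrspacecast ptr %p to ptr addrspace(1)
//   %v = load float, ptr addrspace(1) %global, align 4, !invariant.load !0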
static Value *MakeLdg(CodeGenFunction &CGF, const CallExpr *E) {
  Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
  QualType ArgType = E->getArg(0)->getType();
  clang::CharUnits AlignV = CGF.CGM.getNaturalPointeeTypeAlignment(ArgType);
  llvm::Type *ElemTy = CGF.ConvertTypeForMem(ArgType->getPointeeType());

  // Use addrspace(1) for NVPTX ADDRESS_SPACE_GLOBAL
  auto *ASC = CGF.Builder.CreateAddrSpaceCast(Ptr, CGF.Builder.getPtrTy(1));
  auto *LD = CGF.Builder.CreateAlignedLoad(ElemTy, ASC, AlignV.getAsAlign());
  MDNode *MD = MDNode::get(CGF.Builder.getContext(), {});
  LD->setMetadata(LLVMContext::MD_invariant_load, MD);

  return LD;
}

static Value *MakeScopedAtomic(unsigned IntrinsicID, CodeGenFunction &CGF,
                               const CallExpr *E) {
  Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
  llvm::Type *ElemTy =
      CGF.ConvertTypeForMem(E->getArg(0)->getType()->getPointeeType());
  return CGF.Builder.CreateCall(
      CGF.CGM.getIntrinsic(IntrinsicID, {ElemTy, Ptr->getType()}),
      {Ptr, CGF.EmitScalarExpr(E->getArg(1))});
}

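// cp.async builtins may carry an optional trailing "source size" argument;
// when it is present (three arguments total) we lower to the IntrinsicIDS
// ("_s") variant, otherwise to the plain IntrinsicID form. SrcSize itself is
// not consulted here.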
static Value *MakeCpAsync(unsigned IntrinsicID, unsigned IntrinsicIDS,
                          CodeGenFunction &CGF, const CallExpr *E,
                          int SrcSize) {
  return E->getNumArgs() == 3
             ? CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IntrinsicIDS),
                                      {CGF.EmitScalarExpr(E->getArg(0)),
                                       CGF.EmitScalarExpr(E->getArg(1)),
                                       CGF.EmitScalarExpr(E->getArg(2))})
             : CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IntrinsicID),
                                      {CGF.EmitScalarExpr(E->getArg(0)),
                                       CGF.EmitScalarExpr(E->getArg(1))});
}

static Value *MakeHalfType(unsigned IntrinsicID, unsigned BuiltinID,
                           const CallExpr *E, CodeGenFunction &CGF) {
  auto &C = CGF.CGM.getContext();
  if (!(C.getLangOpts().NativeHalfType ||
        !C.getTargetInfo().useFP16ConversionIntrinsics())) {
    CGF.CGM.Error(E->getExprLoc(), C.BuiltinInfo.getQuotedName(BuiltinID) +
                                       " requires native half type support.");
    return nullptr;
  }

  if (BuiltinID == NVPTX::BI__nvvm_ldg_h || BuiltinID == NVPTX::BI__nvvm_ldg_h2)
    return MakeLdg(CGF, E);

  if (IntrinsicID == Intrinsic::nvvm_ldu_global_f)
    return MakeLdu(IntrinsicID, CGF, E);

  SmallVector<Value *, 16> Args;
  auto *F = CGF.CGM.getIntrinsic(IntrinsicID);
  auto *FTy = F->getFunctionType();
  unsigned ICEArguments = 0;
  ASTContext::GetBuiltinTypeError Error;
  C.GetBuiltinType(BuiltinID, Error, &ICEArguments);
  assert(Error == ASTContext::GE_None && "Should not codegen an error");
  for (unsigned i = 0, e = E->getNumArgs(); i != e; ++i) {
    assert((ICEArguments & (1 << i)) == 0);
    auto *ArgValue = CGF.EmitScalarExpr(E->getArg(i));
    auto *PTy = FTy->getParamType(i);
    if (PTy != ArgValue->getType())
      ArgValue = CGF.Builder.CreateBitCast(ArgValue, PTy);
    Args.push_back(ArgValue);
  }

  return CGF.Builder.CreateCall(F, Args);
}
} // namespace

Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
                                             const CallExpr *E) {
  switch (BuiltinID) {
  case NVPTX::BI__nvvm_atom_add_gen_i:
  case NVPTX::BI__nvvm_atom_add_gen_l:
  case NVPTX::BI__nvvm_atom_add_gen_ll:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Add, E);

  case NVPTX::BI__nvvm_atom_sub_gen_i:
  case NVPTX::BI__nvvm_atom_sub_gen_l:
  case NVPTX::BI__nvvm_atom_sub_gen_ll:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Sub, E);

  case NVPTX::BI__nvvm_atom_and_gen_i:
  case NVPTX::BI__nvvm_atom_and_gen_l:
  case NVPTX::BI__nvvm_atom_and_gen_ll:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::And, E);

  case NVPTX::BI__nvvm_atom_or_gen_i:
  case NVPTX::BI__nvvm_atom_or_gen_l:
  case NVPTX::BI__nvvm_atom_or_gen_ll:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Or, E);

  case NVPTX::BI__nvvm_atom_xor_gen_i:
  case NVPTX::BI__nvvm_atom_xor_gen_l:
  case NVPTX::BI__nvvm_atom_xor_gen_ll:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Xor, E);

  case NVPTX::BI__nvvm_atom_xchg_gen_i:
  case NVPTX::BI__nvvm_atom_xchg_gen_l:
  case NVPTX::BI__nvvm_atom_xchg_gen_ll:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Xchg, E);

  case NVPTX::BI__nvvm_atom_max_gen_i:
  case NVPTX::BI__nvvm_atom_max_gen_l:
  case NVPTX::BI__nvvm_atom_max_gen_ll:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Max, E);

  case NVPTX::BI__nvvm_atom_max_gen_ui:
  case NVPTX::BI__nvvm_atom_max_gen_ul:
  case NVPTX::BI__nvvm_atom_max_gen_ull:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UMax, E);

  case NVPTX::BI__nvvm_atom_min_gen_i:
  case NVPTX::BI__nvvm_atom_min_gen_l:
  case NVPTX::BI__nvvm_atom_min_gen_ll:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Min, E);

  case NVPTX::BI__nvvm_atom_min_gen_ui:
  case NVPTX::BI__nvvm_atom_min_gen_ul:
  case NVPTX::BI__nvvm_atom_min_gen_ull:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UMin, E);

  case NVPTX::BI__nvvm_atom_cas_gen_us:
  case NVPTX::BI__nvvm_atom_cas_gen_i:
  case NVPTX::BI__nvvm_atom_cas_gen_l:
  case NVPTX::BI__nvvm_atom_cas_gen_ll:
    // __nvvm_atom_cas_gen_* should return the old value rather than the
    // success flag.
    return MakeAtomicCmpXchgValue(*this, E, /*ReturnBool=*/false);

  case NVPTX::BI__nvvm_atom_add_gen_f:
  case NVPTX::BI__nvvm_atom_add_gen_d: {
    Address DestAddr = EmitPointerWithAlignment(E->getArg(0));
    Value *Val = EmitScalarExpr(E->getArg(1));

    return Builder.CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, DestAddr, Val,
                                   AtomicOrdering::SequentiallyConsistent);
  }

  case NVPTX::BI__nvvm_atom_inc_gen_ui:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UIncWrap, E);

  case NVPTX::BI__nvvm_atom_dec_gen_ui:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UDecWrap, E);

  case NVPTX::BI__nvvm_ldg_c:
  case NVPTX::BI__nvvm_ldg_sc:
  case NVPTX::BI__nvvm_ldg_c2:
  case NVPTX::BI__nvvm_ldg_sc2:
  case NVPTX::BI__nvvm_ldg_c4:
  case NVPTX::BI__nvvm_ldg_sc4:
  case NVPTX::BI__nvvm_ldg_s:
  case NVPTX::BI__nvvm_ldg_s2:
  case NVPTX::BI__nvvm_ldg_s4:
  case NVPTX::BI__nvvm_ldg_i:
  case NVPTX::BI__nvvm_ldg_i2:
  case NVPTX::BI__nvvm_ldg_i4:
  case NVPTX::BI__nvvm_ldg_l:
  case NVPTX::BI__nvvm_ldg_l2:
  case NVPTX::BI__nvvm_ldg_ll:
  case NVPTX::BI__nvvm_ldg_ll2:
  case NVPTX::BI__nvvm_ldg_uc:
  case NVPTX::BI__nvvm_ldg_uc2:
  case NVPTX::BI__nvvm_ldg_uc4:
  case NVPTX::BI__nvvm_ldg_us:
  case NVPTX::BI__nvvm_ldg_us2:
  case NVPTX::BI__nvvm_ldg_us4:
  case NVPTX::BI__nvvm_ldg_ui:
  case NVPTX::BI__nvvm_ldg_ui2:
  case NVPTX::BI__nvvm_ldg_ui4:
  case NVPTX::BI__nvvm_ldg_ul:
  case NVPTX::BI__nvvm_ldg_ul2:
  case NVPTX::BI__nvvm_ldg_ull:
  case NVPTX::BI__nvvm_ldg_ull2:
  case NVPTX::BI__nvvm_ldg_f:
  case NVPTX::BI__nvvm_ldg_f2:
  case NVPTX::BI__nvvm_ldg_f4:
  case NVPTX::BI__nvvm_ldg_d:
  case NVPTX::BI__nvvm_ldg_d2:
    // PTX Interoperability section 2.2: "For a vector with an even number of
    // elements, its alignment is set to number of elements times the alignment
    // of its member: n*alignof(t)."
    return MakeLdg(*this, E);

  case NVPTX::BI__nvvm_ldu_c:
  case NVPTX::BI__nvvm_ldu_sc:
  case NVPTX::BI__nvvm_ldu_c2:
  case NVPTX::BI__nvvm_ldu_sc2:
  case NVPTX::BI__nvvm_ldu_c4:
  case NVPTX::BI__nvvm_ldu_sc4:
  case NVPTX::BI__nvvm_ldu_s:
  case NVPTX::BI__nvvm_ldu_s2:
  case NVPTX::BI__nvvm_ldu_s4:
  case NVPTX::BI__nvvm_ldu_i:
  case NVPTX::BI__nvvm_ldu_i2:
  case NVPTX::BI__nvvm_ldu_i4:
  case NVPTX::BI__nvvm_ldu_l:
  case NVPTX::BI__nvvm_ldu_l2:
  case NVPTX::BI__nvvm_ldu_ll:
  case NVPTX::BI__nvvm_ldu_ll2:
  case NVPTX::BI__nvvm_ldu_uc:
  case NVPTX::BI__nvvm_ldu_uc2:
  case NVPTX::BI__nvvm_ldu_uc4:
  case NVPTX::BI__nvvm_ldu_us:
  case NVPTX::BI__nvvm_ldu_us2:
  case NVPTX::BI__nvvm_ldu_us4:
  case NVPTX::BI__nvvm_ldu_ui:
  case NVPTX::BI__nvvm_ldu_ui2:
  case NVPTX::BI__nvvm_ldu_ui4:
  case NVPTX::BI__nvvm_ldu_ul:
  case NVPTX::BI__nvvm_ldu_ul2:
  case NVPTX::BI__nvvm_ldu_ull:
  case NVPTX::BI__nvvm_ldu_ull2:
    return MakeLdu(Intrinsic::nvvm_ldu_global_i, *this, E);
  case NVPTX::BI__nvvm_ldu_f:
  case NVPTX::BI__nvvm_ldu_f2:
  case NVPTX::BI__nvvm_ldu_f4:
  case NVPTX::BI__nvvm_ldu_d:
  case NVPTX::BI__nvvm_ldu_d2:
    return MakeLdu(Intrinsic::nvvm_ldu_global_f, *this, E);

  case NVPTX::BI__nvvm_atom_cta_add_gen_i:
  case NVPTX::BI__nvvm_atom_cta_add_gen_l:
  case NVPTX::BI__nvvm_atom_cta_add_gen_ll:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_i_cta, *this, E);
  case NVPTX::BI__nvvm_atom_sys_add_gen_i:
  case NVPTX::BI__nvvm_atom_sys_add_gen_l:
  case NVPTX::BI__nvvm_atom_sys_add_gen_ll:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_i_sys, *this, E);
  case NVPTX::BI__nvvm_atom_cta_add_gen_f:
  case NVPTX::BI__nvvm_atom_cta_add_gen_d:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_f_cta, *this, E);
  case NVPTX::BI__nvvm_atom_sys_add_gen_f:
  case NVPTX::BI__nvvm_atom_sys_add_gen_d:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_f_sys, *this, E);
  case NVPTX::BI__nvvm_atom_cta_xchg_gen_i:
  case NVPTX::BI__nvvm_atom_cta_xchg_gen_l:
  case NVPTX::BI__nvvm_atom_cta_xchg_gen_ll:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_exch_gen_i_cta, *this, E);
  case NVPTX::BI__nvvm_atom_sys_xchg_gen_i:
  case NVPTX::BI__nvvm_atom_sys_xchg_gen_l:
  case NVPTX::BI__nvvm_atom_sys_xchg_gen_ll:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_exch_gen_i_sys, *this, E);
  case NVPTX::BI__nvvm_atom_cta_max_gen_i:
  case NVPTX::BI__nvvm_atom_cta_max_gen_ui:
  case NVPTX::BI__nvvm_atom_cta_max_gen_l:
  case NVPTX::BI__nvvm_atom_cta_max_gen_ul:
  case NVPTX::BI__nvvm_atom_cta_max_gen_ll:
  case NVPTX::BI__nvvm_atom_cta_max_gen_ull:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_max_gen_i_cta, *this, E);
  case NVPTX::BI__nvvm_atom_sys_max_gen_i:
  case NVPTX::BI__nvvm_atom_sys_max_gen_ui:
  case NVPTX::BI__nvvm_atom_sys_max_gen_l:
  case NVPTX::BI__nvvm_atom_sys_max_gen_ul:
  case NVPTX::BI__nvvm_atom_sys_max_gen_ll:
  case NVPTX::BI__nvvm_atom_sys_max_gen_ull:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_max_gen_i_sys, *this, E);
  case NVPTX::BI__nvvm_atom_cta_min_gen_i:
  case NVPTX::BI__nvvm_atom_cta_min_gen_ui:
  case NVPTX::BI__nvvm_atom_cta_min_gen_l:
  case NVPTX::BI__nvvm_atom_cta_min_gen_ul:
  case NVPTX::BI__nvvm_atom_cta_min_gen_ll:
  case NVPTX::BI__nvvm_atom_cta_min_gen_ull:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_min_gen_i_cta, *this, E);
  case NVPTX::BI__nvvm_atom_sys_min_gen_i:
  case NVPTX::BI__nvvm_atom_sys_min_gen_ui:
  case NVPTX::BI__nvvm_atom_sys_min_gen_l:
  case NVPTX::BI__nvvm_atom_sys_min_gen_ul:
  case NVPTX::BI__nvvm_atom_sys_min_gen_ll:
  case NVPTX::BI__nvvm_atom_sys_min_gen_ull:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_min_gen_i_sys, *this, E);
  case NVPTX::BI__nvvm_atom_cta_inc_gen_ui:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_inc_gen_i_cta, *this, E);
  case NVPTX::BI__nvvm_atom_cta_dec_gen_ui:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_dec_gen_i_cta, *this, E);
  case NVPTX::BI__nvvm_atom_sys_inc_gen_ui:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_inc_gen_i_sys, *this, E);
  case NVPTX::BI__nvvm_atom_sys_dec_gen_ui:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_dec_gen_i_sys, *this, E);
  case NVPTX::BI__nvvm_atom_cta_and_gen_i:
  case NVPTX::BI__nvvm_atom_cta_and_gen_l:
  case NVPTX::BI__nvvm_atom_cta_and_gen_ll:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_and_gen_i_cta, *this, E);
  case NVPTX::BI__nvvm_atom_sys_and_gen_i:
  case NVPTX::BI__nvvm_atom_sys_and_gen_l:
  case NVPTX::BI__nvvm_atom_sys_and_gen_ll:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_and_gen_i_sys, *this, E);
  case NVPTX::BI__nvvm_atom_cta_or_gen_i:
  case NVPTX::BI__nvvm_atom_cta_or_gen_l:
  case NVPTX::BI__nvvm_atom_cta_or_gen_ll:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_or_gen_i_cta, *this, E);
  case NVPTX::BI__nvvm_atom_sys_or_gen_i:
  case NVPTX::BI__nvvm_atom_sys_or_gen_l:
  case NVPTX::BI__nvvm_atom_sys_or_gen_ll:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_or_gen_i_sys, *this, E);
  case NVPTX::BI__nvvm_atom_cta_xor_gen_i:
  case NVPTX::BI__nvvm_atom_cta_xor_gen_l:
  case NVPTX::BI__nvvm_atom_cta_xor_gen_ll:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_xor_gen_i_cta, *this, E);
  case NVPTX::BI__nvvm_atom_sys_xor_gen_i:
  case NVPTX::BI__nvvm_atom_sys_xor_gen_l:
  case NVPTX::BI__nvvm_atom_sys_xor_gen_ll:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_xor_gen_i_sys, *this, E);
  case NVPTX::BI__nvvm_atom_cta_cas_gen_us:
  case NVPTX::BI__nvvm_atom_cta_cas_gen_i:
  case NVPTX::BI__nvvm_atom_cta_cas_gen_l:
  case NVPTX::BI__nvvm_atom_cta_cas_gen_ll: {
    Value *Ptr = EmitScalarExpr(E->getArg(0));
    llvm::Type *ElemTy =
        ConvertTypeForMem(E->getArg(0)->getType()->getPointeeType());
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_atomic_cas_gen_i_cta,
                         {ElemTy, Ptr->getType()}),
        {Ptr, EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2))});
  }
  case NVPTX::BI__nvvm_atom_sys_cas_gen_us:
  case NVPTX::BI__nvvm_atom_sys_cas_gen_i:
  case NVPTX::BI__nvvm_atom_sys_cas_gen_l:
  case NVPTX::BI__nvvm_atom_sys_cas_gen_ll: {
    Value *Ptr = EmitScalarExpr(E->getArg(0));
    llvm::Type *ElemTy =
        ConvertTypeForMem(E->getArg(0)->getType()->getPointeeType());
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_atomic_cas_gen_i_sys,
                         {ElemTy, Ptr->getType()}),
        {Ptr, EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2))});
  }
  case NVPTX::BI__nvvm_match_all_sync_i32p:
  case NVPTX::BI__nvvm_match_all_sync_i64p: {
    Value *Mask = EmitScalarExpr(E->getArg(0));
    Value *Val = EmitScalarExpr(E->getArg(1));
    Address PredOutPtr = EmitPointerWithAlignment(E->getArg(2));
    Value *ResultPair = Builder.CreateCall(
        CGM.getIntrinsic(BuiltinID == NVPTX::BI__nvvm_match_all_sync_i32p
                             ? Intrinsic::nvvm_match_all_sync_i32p
                             : Intrinsic::nvvm_match_all_sync_i64p),
        {Mask, Val});
    Value *Pred = Builder.CreateZExt(Builder.CreateExtractValue(ResultPair, 1),
                                     PredOutPtr.getElementType());
    Builder.CreateStore(Pred, PredOutPtr);
    return Builder.CreateExtractValue(ResultPair, 0);
  }

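  // All MMA load/store builtins below share one lowering pattern: look up the
  // row/col intrinsic variant via getNVPTXMmaLdstInfo(), call it, and copy
  // each fragment element between the intrinsic's aggregate and the
  // user-provided pointer.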
  // FP MMA loads
  case NVPTX::BI__hmma_m16n16k16_ld_a:
  case NVPTX::BI__hmma_m16n16k16_ld_b:
  case NVPTX::BI__hmma_m16n16k16_ld_c_f16:
  case NVPTX::BI__hmma_m16n16k16_ld_c_f32:
  case NVPTX::BI__hmma_m32n8k16_ld_a:
  case NVPTX::BI__hmma_m32n8k16_ld_b:
  case NVPTX::BI__hmma_m32n8k16_ld_c_f16:
  case NVPTX::BI__hmma_m32n8k16_ld_c_f32:
  case NVPTX::BI__hmma_m8n32k16_ld_a:
  case NVPTX::BI__hmma_m8n32k16_ld_b:
  case NVPTX::BI__hmma_m8n32k16_ld_c_f16:
  case NVPTX::BI__hmma_m8n32k16_ld_c_f32:
  // Integer MMA loads.
  case NVPTX::BI__imma_m16n16k16_ld_a_s8:
  case NVPTX::BI__imma_m16n16k16_ld_a_u8:
  case NVPTX::BI__imma_m16n16k16_ld_b_s8:
  case NVPTX::BI__imma_m16n16k16_ld_b_u8:
  case NVPTX::BI__imma_m16n16k16_ld_c:
  case NVPTX::BI__imma_m32n8k16_ld_a_s8:
  case NVPTX::BI__imma_m32n8k16_ld_a_u8:
  case NVPTX::BI__imma_m32n8k16_ld_b_s8:
  case NVPTX::BI__imma_m32n8k16_ld_b_u8:
  case NVPTX::BI__imma_m32n8k16_ld_c:
  case NVPTX::BI__imma_m8n32k16_ld_a_s8:
  case NVPTX::BI__imma_m8n32k16_ld_a_u8:
  case NVPTX::BI__imma_m8n32k16_ld_b_s8:
  case NVPTX::BI__imma_m8n32k16_ld_b_u8:
  case NVPTX::BI__imma_m8n32k16_ld_c:
  // Sub-integer MMA loads.
  case NVPTX::BI__imma_m8n8k32_ld_a_s4:
  case NVPTX::BI__imma_m8n8k32_ld_a_u4:
  case NVPTX::BI__imma_m8n8k32_ld_b_s4:
  case NVPTX::BI__imma_m8n8k32_ld_b_u4:
  case NVPTX::BI__imma_m8n8k32_ld_c:
  case NVPTX::BI__bmma_m8n8k128_ld_a_b1:
  case NVPTX::BI__bmma_m8n8k128_ld_b_b1:
  case NVPTX::BI__bmma_m8n8k128_ld_c:
  // Double MMA loads.
  case NVPTX::BI__dmma_m8n8k4_ld_a:
  case NVPTX::BI__dmma_m8n8k4_ld_b:
  case NVPTX::BI__dmma_m8n8k4_ld_c:
  // Alternate float MMA loads.
  case NVPTX::BI__mma_bf16_m16n16k16_ld_a:
  case NVPTX::BI__mma_bf16_m16n16k16_ld_b:
  case NVPTX::BI__mma_bf16_m8n32k16_ld_a:
  case NVPTX::BI__mma_bf16_m8n32k16_ld_b:
  case NVPTX::BI__mma_bf16_m32n8k16_ld_a:
  case NVPTX::BI__mma_bf16_m32n8k16_ld_b:
  case NVPTX::BI__mma_tf32_m16n16k8_ld_a:
  case NVPTX::BI__mma_tf32_m16n16k8_ld_b:
  case NVPTX::BI__mma_tf32_m16n16k8_ld_c: {
    Address Dst = EmitPointerWithAlignment(E->getArg(0));
    Value *Src = EmitScalarExpr(E->getArg(1));
    Value *Ldm = EmitScalarExpr(E->getArg(2));
    std::optional<llvm::APSInt> isColMajorArg =
        E->getArg(3)->getIntegerConstantExpr(getContext());
    if (!isColMajorArg)
      return nullptr;
    bool isColMajor = isColMajorArg->getSExtValue();
    NVPTXMmaLdstInfo II = getNVPTXMmaLdstInfo(BuiltinID);
    unsigned IID = isColMajor ? II.IID_col : II.IID_row;
    if (IID == 0)
      return nullptr;

    Value *Result =
        Builder.CreateCall(CGM.getIntrinsic(IID, Src->getType()), {Src, Ldm});

    // Save returned values.
    assert(II.NumResults);
    if (II.NumResults == 1) {
      Builder.CreateAlignedStore(Result, Dst.emitRawPointer(*this),
                                 CharUnits::fromQuantity(4));
    } else {
      for (unsigned i = 0; i < II.NumResults; ++i) {
        Builder.CreateAlignedStore(
            Builder.CreateBitCast(Builder.CreateExtractValue(Result, i),
                                  Dst.getElementType()),
            Builder.CreateGEP(Dst.getElementType(), Dst.emitRawPointer(*this),
                              llvm::ConstantInt::get(IntTy, i)),
            CharUnits::fromQuantity(4));
      }
    }
    return Result;
  }

  case NVPTX::BI__hmma_m16n16k16_st_c_f16:
  case NVPTX::BI__hmma_m16n16k16_st_c_f32:
  case NVPTX::BI__hmma_m32n8k16_st_c_f16:
  case NVPTX::BI__hmma_m32n8k16_st_c_f32:
  case NVPTX::BI__hmma_m8n32k16_st_c_f16:
  case NVPTX::BI__hmma_m8n32k16_st_c_f32:
  case NVPTX::BI__imma_m16n16k16_st_c_i32:
  case NVPTX::BI__imma_m32n8k16_st_c_i32:
  case NVPTX::BI__imma_m8n32k16_st_c_i32:
  case NVPTX::BI__imma_m8n8k32_st_c_i32:
  case NVPTX::BI__bmma_m8n8k128_st_c_i32:
  case NVPTX::BI__dmma_m8n8k4_st_c_f64:
  case NVPTX::BI__mma_m16n16k8_st_c_f32: {
    Value *Dst = EmitScalarExpr(E->getArg(0));
    Address Src = EmitPointerWithAlignment(E->getArg(1));
    Value *Ldm = EmitScalarExpr(E->getArg(2));
    std::optional<llvm::APSInt> isColMajorArg =
        E->getArg(3)->getIntegerConstantExpr(getContext());
    if (!isColMajorArg)
      return nullptr;
    bool isColMajor = isColMajorArg->getSExtValue();
    NVPTXMmaLdstInfo II = getNVPTXMmaLdstInfo(BuiltinID);
    unsigned IID = isColMajor ? II.IID_col : II.IID_row;
    if (IID == 0)
      return nullptr;
    Function *Intrinsic = CGM.getIntrinsic(IID, Dst->getType());
    llvm::Type *ParamType = Intrinsic->getFunctionType()->getParamType(1);
    SmallVector<Value *, 10> Values = {Dst};
    for (unsigned i = 0; i < II.NumResults; ++i) {
      Value *V = Builder.CreateAlignedLoad(
          Src.getElementType(),
          Builder.CreateGEP(Src.getElementType(), Src.emitRawPointer(*this),
                            llvm::ConstantInt::get(IntTy, i)),
          CharUnits::fromQuantity(4));
      Values.push_back(Builder.CreateBitCast(V, ParamType));
    }
    Values.push_back(Ldm);
    Value *Result = Builder.CreateCall(Intrinsic, Values);
    return Result;
  }

  // BI__hmma_m16n16k16_mma_<Dtype><CType>(d, a, b, c, layout, satf) -->
  // Intrinsic::nvvm_wmma_m16n16k16_mma_sync<layout A,B><DType><CType><Satf>
  case NVPTX::BI__hmma_m16n16k16_mma_f16f16:
  case NVPTX::BI__hmma_m16n16k16_mma_f32f16:
  case NVPTX::BI__hmma_m16n16k16_mma_f32f32:
  case NVPTX::BI__hmma_m16n16k16_mma_f16f32:
  case NVPTX::BI__hmma_m32n8k16_mma_f16f16:
  case NVPTX::BI__hmma_m32n8k16_mma_f32f16:
  case NVPTX::BI__hmma_m32n8k16_mma_f32f32:
  case NVPTX::BI__hmma_m32n8k16_mma_f16f32:
  case NVPTX::BI__hmma_m8n32k16_mma_f16f16:
  case NVPTX::BI__hmma_m8n32k16_mma_f32f16:
  case NVPTX::BI__hmma_m8n32k16_mma_f32f32:
  case NVPTX::BI__hmma_m8n32k16_mma_f16f32:
  case NVPTX::BI__imma_m16n16k16_mma_s8:
  case NVPTX::BI__imma_m16n16k16_mma_u8:
  case NVPTX::BI__imma_m32n8k16_mma_s8:
  case NVPTX::BI__imma_m32n8k16_mma_u8:
  case NVPTX::BI__imma_m8n32k16_mma_s8:
  case NVPTX::BI__imma_m8n32k16_mma_u8:
  case NVPTX::BI__imma_m8n8k32_mma_s4:
  case NVPTX::BI__imma_m8n8k32_mma_u4:
  case NVPTX::BI__bmma_m8n8k128_mma_xor_popc_b1:
  case NVPTX::BI__bmma_m8n8k128_mma_and_popc_b1:
  case NVPTX::BI__dmma_m8n8k4_mma_f64:
  case NVPTX::BI__mma_bf16_m16n16k16_mma_f32:
  case NVPTX::BI__mma_bf16_m8n32k16_mma_f32:
  case NVPTX::BI__mma_bf16_m32n8k16_mma_f32:
  case NVPTX::BI__mma_tf32_m16n16k8_mma_f32: {
    Address Dst = EmitPointerWithAlignment(E->getArg(0));
    Address SrcA = EmitPointerWithAlignment(E->getArg(1));
    Address SrcB = EmitPointerWithAlignment(E->getArg(2));
    Address SrcC = EmitPointerWithAlignment(E->getArg(3));
    std::optional<llvm::APSInt> LayoutArg =
        E->getArg(4)->getIntegerConstantExpr(getContext());
    if (!LayoutArg)
      return nullptr;
    int Layout = LayoutArg->getSExtValue();
    if (Layout < 0 || Layout > 3)
      return nullptr;
    llvm::APSInt SatfArg;
    if (BuiltinID == NVPTX::BI__bmma_m8n8k128_mma_xor_popc_b1 ||
        BuiltinID == NVPTX::BI__bmma_m8n8k128_mma_and_popc_b1)
      SatfArg = 0; // .b1 does not have satf argument.
    else if (std::optional<llvm::APSInt> OptSatfArg =
                 E->getArg(5)->getIntegerConstantExpr(getContext()))
      SatfArg = *OptSatfArg;
    else
      return nullptr;
    bool Satf = SatfArg.getSExtValue();
    NVPTXMmaInfo MI = getNVPTXMmaInfo(BuiltinID);
    unsigned IID = MI.getMMAIntrinsic(Layout, Satf);
    if (IID == 0) // Unsupported combination of Layout/Satf.
      return nullptr;

    SmallVector<Value *, 24> Values;
    Function *Intrinsic = CGM.getIntrinsic(IID);
    llvm::Type *AType = Intrinsic->getFunctionType()->getParamType(0);
    // Load A
    for (unsigned i = 0; i < MI.NumEltsA; ++i) {
      Value *V = Builder.CreateAlignedLoad(
          SrcA.getElementType(),
          Builder.CreateGEP(SrcA.getElementType(), SrcA.emitRawPointer(*this),
                            llvm::ConstantInt::get(IntTy, i)),
          CharUnits::fromQuantity(4));
      Values.push_back(Builder.CreateBitCast(V, AType));
    }
    // Load B
    llvm::Type *BType = Intrinsic->getFunctionType()->getParamType(MI.NumEltsA);
    for (unsigned i = 0; i < MI.NumEltsB; ++i) {
      Value *V = Builder.CreateAlignedLoad(
          SrcB.getElementType(),
          Builder.CreateGEP(SrcB.getElementType(), SrcB.emitRawPointer(*this),
                            llvm::ConstantInt::get(IntTy, i)),
          CharUnits::fromQuantity(4));
      Values.push_back(Builder.CreateBitCast(V, BType));
    }
    // Load C
    llvm::Type *CType =
        Intrinsic->getFunctionType()->getParamType(MI.NumEltsA + MI.NumEltsB);
    for (unsigned i = 0; i < MI.NumEltsC; ++i) {
      Value *V = Builder.CreateAlignedLoad(
          SrcC.getElementType(),
          Builder.CreateGEP(SrcC.getElementType(), SrcC.emitRawPointer(*this),
                            llvm::ConstantInt::get(IntTy, i)),
          CharUnits::fromQuantity(4));
      Values.push_back(Builder.CreateBitCast(V, CType));
    }
    Value *Result = Builder.CreateCall(Intrinsic, Values);
    llvm::Type *DType = Dst.getElementType();
    for (unsigned i = 0; i < MI.NumEltsD; ++i)
      Builder.CreateAlignedStore(
          Builder.CreateBitCast(Builder.CreateExtractValue(Result, i), DType),
          Builder.CreateGEP(Dst.getElementType(), Dst.emitRawPointer(*this),
                            llvm::ConstantInt::get(IntTy, i)),
          CharUnits::fromQuantity(4));
    return Result;
  }
  // The following builtins require half type support
  case NVPTX::BI__nvvm_ex2_approx_f16:
    return MakeHalfType(Intrinsic::nvvm_ex2_approx_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_ex2_approx_f16x2:
    return MakeHalfType(Intrinsic::nvvm_ex2_approx_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_ff2f16x2_rn:
    return MakeHalfType(Intrinsic::nvvm_ff2f16x2_rn, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_ff2f16x2_rn_relu:
    return MakeHalfType(Intrinsic::nvvm_ff2f16x2_rn_relu, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_ff2f16x2_rz:
    return MakeHalfType(Intrinsic::nvvm_ff2f16x2_rz, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_ff2f16x2_rz_relu:
    return MakeHalfType(Intrinsic::nvvm_ff2f16x2_rz_relu, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_f16:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_ftz_f16:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_ftz_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_ftz_relu_f16:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_relu_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fma_rn_ftz_relu_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_relu_f16x2, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fma_rn_ftz_sat_f16:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_sat_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fma_rn_ftz_sat_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_sat_f16x2, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fma_rn_relu_f16:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_relu_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_relu_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_relu_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_sat_f16:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_sat_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_sat_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_sat_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_ftz_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_ftz_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_ftz_nan_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_nan_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_ftz_nan_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_nan_f16x2, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmax_ftz_nan_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_nan_xorsign_abs_f16, BuiltinID,
                        E, *this);
  case NVPTX::BI__nvvm_fmax_ftz_nan_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_nan_xorsign_abs_f16x2,
                        BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_ftz_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_xorsign_abs_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmax_ftz_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_xorsign_abs_f16x2, BuiltinID,
                        E, *this);
  case NVPTX::BI__nvvm_fmax_nan_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_nan_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_nan_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_nan_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_nan_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_nan_xorsign_abs_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmax_nan_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_nan_xorsign_abs_f16x2, BuiltinID,
                        E, *this);
  case NVPTX::BI__nvvm_fmax_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_xorsign_abs_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmax_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_xorsign_abs_f16x2, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmin_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_ftz_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_ftz_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_ftz_nan_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_nan_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_ftz_nan_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_nan_f16x2, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmin_ftz_nan_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_nan_xorsign_abs_f16, BuiltinID,
                        E, *this);
  case NVPTX::BI__nvvm_fmin_ftz_nan_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_nan_xorsign_abs_f16x2,
                        BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_ftz_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_xorsign_abs_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmin_ftz_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_xorsign_abs_f16x2, BuiltinID,
                        E, *this);
  case NVPTX::BI__nvvm_fmin_nan_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_nan_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_nan_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_nan_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_nan_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_nan_xorsign_abs_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmin_nan_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_nan_xorsign_abs_f16x2, BuiltinID,
                        E, *this);
  case NVPTX::BI__nvvm_fmin_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_xorsign_abs_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmin_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_xorsign_abs_f16x2, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fabs_f:
  case NVPTX::BI__nvvm_abs_bf16:
  case NVPTX::BI__nvvm_abs_bf16x2:
  case NVPTX::BI__nvvm_fabs_f16:
  case NVPTX::BI__nvvm_fabs_f16x2:
    return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_fabs,
                                        EmitScalarExpr(E->getArg(0)));
  case NVPTX::BI__nvvm_fabs_ftz_f:
  case NVPTX::BI__nvvm_fabs_ftz_f16:
  case NVPTX::BI__nvvm_fabs_ftz_f16x2:
    return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_fabs_ftz,
                                        EmitScalarExpr(E->getArg(0)));
  case NVPTX::BI__nvvm_fabs_d:
    return Builder.CreateUnaryIntrinsic(Intrinsic::fabs,
                                        EmitScalarExpr(E->getArg(0)));
  case NVPTX::BI__nvvm_ldg_h:
  case NVPTX::BI__nvvm_ldg_h2:
    return MakeHalfType(Intrinsic::not_intrinsic, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_ldu_h:
  case NVPTX::BI__nvvm_ldu_h2:
    return MakeHalfType(Intrinsic::nvvm_ldu_global_f, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_cp_async_ca_shared_global_4:
    return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_4,
                       Intrinsic::nvvm_cp_async_ca_shared_global_4_s, *this, E,
                       4);
  case NVPTX::BI__nvvm_cp_async_ca_shared_global_8:
    return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_8,
                       Intrinsic::nvvm_cp_async_ca_shared_global_8_s, *this, E,
                       8);
  case NVPTX::BI__nvvm_cp_async_ca_shared_global_16:
    return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_16,
                       Intrinsic::nvvm_cp_async_ca_shared_global_16_s, *this, E,
                       16);
  case NVPTX::BI__nvvm_cp_async_cg_shared_global_16:
    return MakeCpAsync(Intrinsic::nvvm_cp_async_cg_shared_global_16,
                       Intrinsic::nvvm_cp_async_cg_shared_global_16_s, *this, E,
                       16);
  case NVPTX::BI__nvvm_read_ptx_sreg_clusterid_x:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_clusterid_x));
  case NVPTX::BI__nvvm_read_ptx_sreg_clusterid_y:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_clusterid_y));
  case NVPTX::BI__nvvm_read_ptx_sreg_clusterid_z:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_clusterid_z));
  case NVPTX::BI__nvvm_read_ptx_sreg_clusterid_w:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_clusterid_w));
  case NVPTX::BI__nvvm_read_ptx_sreg_nclusterid_x:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_nclusterid_x));
  case NVPTX::BI__nvvm_read_ptx_sreg_nclusterid_y:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_nclusterid_y));
  case NVPTX::BI__nvvm_read_ptx_sreg_nclusterid_z:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_nclusterid_z));
  case NVPTX::BI__nvvm_read_ptx_sreg_nclusterid_w:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_nclusterid_w));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_ctaid_x:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_x));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_ctaid_y:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_y));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_ctaid_z:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_z));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_ctaid_w:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_w));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_nctaid_x:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_x));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_nctaid_y:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_y));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_nctaid_z:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_z));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_nctaid_w:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_w));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_ctarank:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_ctarank));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_nctarank:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_nctarank));
  case NVPTX::BI__nvvm_is_explicit_cluster:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_is_explicit_cluster));
  case NVPTX::BI__nvvm_isspacep_shared_cluster:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_isspacep_shared_cluster),
        EmitScalarExpr(E->getArg(0)));
  case NVPTX::BI__nvvm_mapa:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_mapa),
        {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1))});
  case NVPTX::BI__nvvm_mapa_shared_cluster:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_mapa_shared_cluster),
        {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1))});
  case NVPTX::BI__nvvm_getctarank:
    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::nvvm_getctarank),
                              EmitScalarExpr(E->getArg(0)));
  case NVPTX::BI__nvvm_getctarank_shared_cluster:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_getctarank_shared_cluster),
        EmitScalarExpr(E->getArg(0)));
  case NVPTX::BI__nvvm_barrier_cluster_arrive:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cluster_arrive));
  case NVPTX::BI__nvvm_barrier_cluster_arrive_relaxed:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cluster_arrive_relaxed));
  case NVPTX::BI__nvvm_barrier_cluster_wait:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cluster_wait));
  case NVPTX::BI__nvvm_fence_sc_cluster:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_fence_sc_cluster));
  case NVPTX::BI__nvvm_bar_sync:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync_aligned_all),
        EmitScalarExpr(E->getArg(0)));
  case NVPTX::BI__syncthreads:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync_aligned_all),
        Builder.getInt32(0));
  case NVPTX::BI__nvvm_barrier_sync:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync_all),
        EmitScalarExpr(E->getArg(0)));
  case NVPTX::BI__nvvm_barrier_sync_cnt:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync_count),
        {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1))});
  default:
    return nullptr;
  }
}