1 | //===-------- NVPTX.cpp - Emit LLVM Code for builtins ---------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This contains code to emit Builtin calls as LLVM code. |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #include "CGBuiltin.h" |
14 | #include "clang/Basic/TargetBuiltins.h" |
15 | #include "llvm/IR/IntrinsicsNVPTX.h" |
16 | |
17 | using namespace clang; |
18 | using namespace CodeGen; |
19 | using namespace llvm; |
20 | |
21 | namespace { |
// Helper classes for mapping MMA builtins to the corresponding LLVM intrinsic
// variant.
23 | struct NVPTXMmaLdstInfo { |
24 | unsigned NumResults; // Number of elements to load/store |
  // Intrinsic IDs for row/col variants. 0 if the particular layout is
  // unsupported.
26 | unsigned IID_col; |
27 | unsigned IID_row; |
28 | }; |
29 | |
30 | #define MMA_INTR(geom_op_type, layout) \ |
31 | Intrinsic::nvvm_wmma_##geom_op_type##_##layout##_stride |
32 | #define MMA_LDST(n, geom_op_type) \ |
33 | { n, MMA_INTR(geom_op_type, col), MMA_INTR(geom_op_type, row) } |
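// For illustration (macro expansion), MMA_LDST(8, m16n16k16_load_a_f16)
// expands to { 8, Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride,
//              Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride }.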
34 | |
35 | static NVPTXMmaLdstInfo getNVPTXMmaLdstInfo(unsigned BuiltinID) { |
36 | switch (BuiltinID) { |
37 | // FP MMA loads |
38 | case NVPTX::BI__hmma_m16n16k16_ld_a: |
39 | return MMA_LDST(8, m16n16k16_load_a_f16); |
40 | case NVPTX::BI__hmma_m16n16k16_ld_b: |
41 | return MMA_LDST(8, m16n16k16_load_b_f16); |
42 | case NVPTX::BI__hmma_m16n16k16_ld_c_f16: |
43 | return MMA_LDST(4, m16n16k16_load_c_f16); |
44 | case NVPTX::BI__hmma_m16n16k16_ld_c_f32: |
45 | return MMA_LDST(8, m16n16k16_load_c_f32); |
46 | case NVPTX::BI__hmma_m32n8k16_ld_a: |
47 | return MMA_LDST(8, m32n8k16_load_a_f16); |
48 | case NVPTX::BI__hmma_m32n8k16_ld_b: |
49 | return MMA_LDST(8, m32n8k16_load_b_f16); |
50 | case NVPTX::BI__hmma_m32n8k16_ld_c_f16: |
51 | return MMA_LDST(4, m32n8k16_load_c_f16); |
52 | case NVPTX::BI__hmma_m32n8k16_ld_c_f32: |
53 | return MMA_LDST(8, m32n8k16_load_c_f32); |
54 | case NVPTX::BI__hmma_m8n32k16_ld_a: |
55 | return MMA_LDST(8, m8n32k16_load_a_f16); |
56 | case NVPTX::BI__hmma_m8n32k16_ld_b: |
57 | return MMA_LDST(8, m8n32k16_load_b_f16); |
58 | case NVPTX::BI__hmma_m8n32k16_ld_c_f16: |
59 | return MMA_LDST(4, m8n32k16_load_c_f16); |
60 | case NVPTX::BI__hmma_m8n32k16_ld_c_f32: |
61 | return MMA_LDST(8, m8n32k16_load_c_f32); |
62 | |
63 | // Integer MMA loads |
64 | case NVPTX::BI__imma_m16n16k16_ld_a_s8: |
65 | return MMA_LDST(2, m16n16k16_load_a_s8); |
66 | case NVPTX::BI__imma_m16n16k16_ld_a_u8: |
67 | return MMA_LDST(2, m16n16k16_load_a_u8); |
68 | case NVPTX::BI__imma_m16n16k16_ld_b_s8: |
69 | return MMA_LDST(2, m16n16k16_load_b_s8); |
70 | case NVPTX::BI__imma_m16n16k16_ld_b_u8: |
71 | return MMA_LDST(2, m16n16k16_load_b_u8); |
72 | case NVPTX::BI__imma_m16n16k16_ld_c: |
73 | return MMA_LDST(8, m16n16k16_load_c_s32); |
74 | case NVPTX::BI__imma_m32n8k16_ld_a_s8: |
75 | return MMA_LDST(4, m32n8k16_load_a_s8); |
76 | case NVPTX::BI__imma_m32n8k16_ld_a_u8: |
77 | return MMA_LDST(4, m32n8k16_load_a_u8); |
78 | case NVPTX::BI__imma_m32n8k16_ld_b_s8: |
79 | return MMA_LDST(1, m32n8k16_load_b_s8); |
80 | case NVPTX::BI__imma_m32n8k16_ld_b_u8: |
81 | return MMA_LDST(1, m32n8k16_load_b_u8); |
82 | case NVPTX::BI__imma_m32n8k16_ld_c: |
83 | return MMA_LDST(8, m32n8k16_load_c_s32); |
84 | case NVPTX::BI__imma_m8n32k16_ld_a_s8: |
85 | return MMA_LDST(1, m8n32k16_load_a_s8); |
86 | case NVPTX::BI__imma_m8n32k16_ld_a_u8: |
87 | return MMA_LDST(1, m8n32k16_load_a_u8); |
88 | case NVPTX::BI__imma_m8n32k16_ld_b_s8: |
89 | return MMA_LDST(4, m8n32k16_load_b_s8); |
90 | case NVPTX::BI__imma_m8n32k16_ld_b_u8: |
91 | return MMA_LDST(4, m8n32k16_load_b_u8); |
92 | case NVPTX::BI__imma_m8n32k16_ld_c: |
93 | return MMA_LDST(8, m8n32k16_load_c_s32); |
94 | |
95 | // Sub-integer MMA loads. |
96 | // Only row/col layout is supported by A/B fragments. |
  case NVPTX::BI__imma_m8n8k32_ld_a_s4:
    return {1, 0, MMA_INTR(m8n8k32_load_a_s4, row)};
  case NVPTX::BI__imma_m8n8k32_ld_a_u4:
    return {1, 0, MMA_INTR(m8n8k32_load_a_u4, row)};
  case NVPTX::BI__imma_m8n8k32_ld_b_s4:
    return {1, MMA_INTR(m8n8k32_load_b_s4, col), 0};
  case NVPTX::BI__imma_m8n8k32_ld_b_u4:
    return {1, MMA_INTR(m8n8k32_load_b_u4, col), 0};
  case NVPTX::BI__imma_m8n8k32_ld_c:
    return MMA_LDST(2, m8n8k32_load_c_s32);
  case NVPTX::BI__bmma_m8n8k128_ld_a_b1:
    return {1, 0, MMA_INTR(m8n8k128_load_a_b1, row)};
  case NVPTX::BI__bmma_m8n8k128_ld_b_b1:
    return {1, MMA_INTR(m8n8k128_load_b_b1, col), 0};
  case NVPTX::BI__bmma_m8n8k128_ld_c:
    return MMA_LDST(2, m8n8k128_load_c_s32);
113 | |
114 | // Double MMA loads |
115 | case NVPTX::BI__dmma_m8n8k4_ld_a: |
116 | return MMA_LDST(1, m8n8k4_load_a_f64); |
117 | case NVPTX::BI__dmma_m8n8k4_ld_b: |
118 | return MMA_LDST(1, m8n8k4_load_b_f64); |
119 | case NVPTX::BI__dmma_m8n8k4_ld_c: |
120 | return MMA_LDST(2, m8n8k4_load_c_f64); |
121 | |
122 | // Alternate float MMA loads |
123 | case NVPTX::BI__mma_bf16_m16n16k16_ld_a: |
124 | return MMA_LDST(4, m16n16k16_load_a_bf16); |
125 | case NVPTX::BI__mma_bf16_m16n16k16_ld_b: |
126 | return MMA_LDST(4, m16n16k16_load_b_bf16); |
127 | case NVPTX::BI__mma_bf16_m8n32k16_ld_a: |
128 | return MMA_LDST(2, m8n32k16_load_a_bf16); |
129 | case NVPTX::BI__mma_bf16_m8n32k16_ld_b: |
130 | return MMA_LDST(8, m8n32k16_load_b_bf16); |
131 | case NVPTX::BI__mma_bf16_m32n8k16_ld_a: |
132 | return MMA_LDST(8, m32n8k16_load_a_bf16); |
133 | case NVPTX::BI__mma_bf16_m32n8k16_ld_b: |
134 | return MMA_LDST(2, m32n8k16_load_b_bf16); |
135 | case NVPTX::BI__mma_tf32_m16n16k8_ld_a: |
136 | return MMA_LDST(4, m16n16k8_load_a_tf32); |
137 | case NVPTX::BI__mma_tf32_m16n16k8_ld_b: |
138 | return MMA_LDST(4, m16n16k8_load_b_tf32); |
139 | case NVPTX::BI__mma_tf32_m16n16k8_ld_c: |
140 | return MMA_LDST(8, m16n16k8_load_c_f32); |
141 | |
  // NOTE: We need to follow the inconsistent naming scheme used by NVCC.
  // Unlike PTX and LLVM IR, where stores always use fragment D, NVCC builtins
  // always use fragment C for both loads and stores.
145 | // FP MMA stores. |
146 | case NVPTX::BI__hmma_m16n16k16_st_c_f16: |
147 | return MMA_LDST(4, m16n16k16_store_d_f16); |
148 | case NVPTX::BI__hmma_m16n16k16_st_c_f32: |
149 | return MMA_LDST(8, m16n16k16_store_d_f32); |
150 | case NVPTX::BI__hmma_m32n8k16_st_c_f16: |
151 | return MMA_LDST(4, m32n8k16_store_d_f16); |
152 | case NVPTX::BI__hmma_m32n8k16_st_c_f32: |
153 | return MMA_LDST(8, m32n8k16_store_d_f32); |
154 | case NVPTX::BI__hmma_m8n32k16_st_c_f16: |
155 | return MMA_LDST(4, m8n32k16_store_d_f16); |
156 | case NVPTX::BI__hmma_m8n32k16_st_c_f32: |
157 | return MMA_LDST(8, m8n32k16_store_d_f32); |
158 | |
159 | // Integer and sub-integer MMA stores. |
160 | // Another naming quirk. Unlike other MMA builtins that use PTX types in the |
161 | // name, integer loads/stores use LLVM's i32. |
162 | case NVPTX::BI__imma_m16n16k16_st_c_i32: |
163 | return MMA_LDST(8, m16n16k16_store_d_s32); |
164 | case NVPTX::BI__imma_m32n8k16_st_c_i32: |
165 | return MMA_LDST(8, m32n8k16_store_d_s32); |
166 | case NVPTX::BI__imma_m8n32k16_st_c_i32: |
167 | return MMA_LDST(8, m8n32k16_store_d_s32); |
168 | case NVPTX::BI__imma_m8n8k32_st_c_i32: |
169 | return MMA_LDST(2, m8n8k32_store_d_s32); |
170 | case NVPTX::BI__bmma_m8n8k128_st_c_i32: |
171 | return MMA_LDST(2, m8n8k128_store_d_s32); |
172 | |
173 | // Double MMA store |
174 | case NVPTX::BI__dmma_m8n8k4_st_c_f64: |
175 | return MMA_LDST(2, m8n8k4_store_d_f64); |
176 | |
177 | // Alternate float MMA store |
178 | case NVPTX::BI__mma_m16n16k8_st_c_f32: |
179 | return MMA_LDST(8, m16n16k8_store_d_f32); |
180 | |
181 | default: |
    llvm_unreachable("Unknown MMA builtin");
183 | } |
184 | } |
185 | #undef MMA_LDST |
186 | #undef MMA_INTR |
187 | |
188 | |
189 | struct NVPTXMmaInfo { |
190 | unsigned NumEltsA; |
191 | unsigned NumEltsB; |
192 | unsigned NumEltsC; |
193 | unsigned NumEltsD; |
194 | |
195 | // Variants are ordered by layout-A/layout-B/satf, where 'row' has priority |
196 | // over 'col' for layout. The index of non-satf variants is expected to match |
197 | // the undocumented layout constants used by CUDA's mma.hpp. |
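  // For example, Layout == 1 (row-major A, column-major B) with Satf == true
  // selects Variants[5], the row_col satfinite intrinsic.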
198 | std::array<unsigned, 8> Variants; |
199 | |
200 | unsigned getMMAIntrinsic(int Layout, bool Satf) { |
201 | unsigned Index = Layout + 4 * Satf; |
202 | if (Index >= Variants.size()) |
203 | return 0; |
204 | return Variants[Index]; |
205 | } |
206 | }; |
207 | |
208 | // Returns an intrinsic that matches Layout and Satf for valid combinations of |
209 | // Layout and Satf, 0 otherwise. |
210 | static NVPTXMmaInfo getNVPTXMmaInfo(unsigned BuiltinID) { |
211 | // clang-format off |
212 | #define MMA_VARIANTS(geom, type) \ |
213 | Intrinsic::nvvm_wmma_##geom##_mma_row_row_##type, \ |
214 | Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type, \ |
215 | Intrinsic::nvvm_wmma_##geom##_mma_col_row_##type, \ |
216 | Intrinsic::nvvm_wmma_##geom##_mma_col_col_##type |
217 | #define MMA_SATF_VARIANTS(geom, type) \ |
218 | MMA_VARIANTS(geom, type), \ |
219 | Intrinsic::nvvm_wmma_##geom##_mma_row_row_##type##_satfinite, \ |
220 | Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type##_satfinite, \ |
221 | Intrinsic::nvvm_wmma_##geom##_mma_col_row_##type##_satfinite, \ |
222 | Intrinsic::nvvm_wmma_##geom##_mma_col_col_##type##_satfinite |
223 | // Sub-integer MMA only supports row.col layout. |
224 | #define MMA_VARIANTS_I4(geom, type) \ |
225 | 0, \ |
226 | Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type, \ |
227 | 0, \ |
228 | 0, \ |
229 | 0, \ |
230 | Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type##_satfinite, \ |
231 | 0, \ |
232 | 0 |
233 | // b1 MMA does not support .satfinite. |
234 | #define MMA_VARIANTS_B1_XOR(geom, type) \ |
235 | 0, \ |
236 | Intrinsic::nvvm_wmma_##geom##_mma_xor_popc_row_col_##type, \ |
237 | 0, \ |
238 | 0, \ |
239 | 0, \ |
240 | 0, \ |
241 | 0, \ |
242 | 0 |
243 | #define MMA_VARIANTS_B1_AND(geom, type) \ |
244 | 0, \ |
245 | Intrinsic::nvvm_wmma_##geom##_mma_and_popc_row_col_##type, \ |
246 | 0, \ |
247 | 0, \ |
248 | 0, \ |
249 | 0, \ |
250 | 0, \ |
251 | 0 |
252 | // clang-format on |
253 | switch (BuiltinID) { |
254 | // FP MMA |
  // Note that the 'type' argument of MMA_SATF_VARIANTS uses D_C notation,
  // while the NumEltsN fields of the return value are ordered as A, B, C, D.
  case NVPTX::BI__hmma_m16n16k16_mma_f16f16:
    return {8, 8, 4, 4, {{MMA_SATF_VARIANTS(m16n16k16, f16_f16)}}};
  case NVPTX::BI__hmma_m16n16k16_mma_f32f16:
    return {8, 8, 4, 8, {{MMA_SATF_VARIANTS(m16n16k16, f32_f16)}}};
  case NVPTX::BI__hmma_m16n16k16_mma_f16f32:
    return {8, 8, 8, 4, {{MMA_SATF_VARIANTS(m16n16k16, f16_f32)}}};
  case NVPTX::BI__hmma_m16n16k16_mma_f32f32:
    return {8, 8, 8, 8, {{MMA_SATF_VARIANTS(m16n16k16, f32_f32)}}};
  case NVPTX::BI__hmma_m32n8k16_mma_f16f16:
    return {8, 8, 4, 4, {{MMA_SATF_VARIANTS(m32n8k16, f16_f16)}}};
  case NVPTX::BI__hmma_m32n8k16_mma_f32f16:
    return {8, 8, 4, 8, {{MMA_SATF_VARIANTS(m32n8k16, f32_f16)}}};
  case NVPTX::BI__hmma_m32n8k16_mma_f16f32:
    return {8, 8, 8, 4, {{MMA_SATF_VARIANTS(m32n8k16, f16_f32)}}};
  case NVPTX::BI__hmma_m32n8k16_mma_f32f32:
    return {8, 8, 8, 8, {{MMA_SATF_VARIANTS(m32n8k16, f32_f32)}}};
  case NVPTX::BI__hmma_m8n32k16_mma_f16f16:
    return {8, 8, 4, 4, {{MMA_SATF_VARIANTS(m8n32k16, f16_f16)}}};
  case NVPTX::BI__hmma_m8n32k16_mma_f32f16:
    return {8, 8, 4, 8, {{MMA_SATF_VARIANTS(m8n32k16, f32_f16)}}};
  case NVPTX::BI__hmma_m8n32k16_mma_f16f32:
    return {8, 8, 8, 4, {{MMA_SATF_VARIANTS(m8n32k16, f16_f32)}}};
  case NVPTX::BI__hmma_m8n32k16_mma_f32f32:
    return {8, 8, 8, 8, {{MMA_SATF_VARIANTS(m8n32k16, f32_f32)}}};

  // Integer MMA
  case NVPTX::BI__imma_m16n16k16_mma_s8:
    return {2, 2, 8, 8, {{MMA_SATF_VARIANTS(m16n16k16, s8)}}};
  case NVPTX::BI__imma_m16n16k16_mma_u8:
    return {2, 2, 8, 8, {{MMA_SATF_VARIANTS(m16n16k16, u8)}}};
  case NVPTX::BI__imma_m32n8k16_mma_s8:
    return {4, 1, 8, 8, {{MMA_SATF_VARIANTS(m32n8k16, s8)}}};
  case NVPTX::BI__imma_m32n8k16_mma_u8:
    return {4, 1, 8, 8, {{MMA_SATF_VARIANTS(m32n8k16, u8)}}};
  case NVPTX::BI__imma_m8n32k16_mma_s8:
    return {1, 4, 8, 8, {{MMA_SATF_VARIANTS(m8n32k16, s8)}}};
  case NVPTX::BI__imma_m8n32k16_mma_u8:
    return {1, 4, 8, 8, {{MMA_SATF_VARIANTS(m8n32k16, u8)}}};

  // Sub-integer MMA
  case NVPTX::BI__imma_m8n8k32_mma_s4:
    return {1, 1, 2, 2, {{MMA_VARIANTS_I4(m8n8k32, s4)}}};
  case NVPTX::BI__imma_m8n8k32_mma_u4:
    return {1, 1, 2, 2, {{MMA_VARIANTS_I4(m8n8k32, u4)}}};
  case NVPTX::BI__bmma_m8n8k128_mma_xor_popc_b1:
    return {1, 1, 2, 2, {{MMA_VARIANTS_B1_XOR(m8n8k128, b1)}}};
  case NVPTX::BI__bmma_m8n8k128_mma_and_popc_b1:
    return {1, 1, 2, 2, {{MMA_VARIANTS_B1_AND(m8n8k128, b1)}}};

  // Double MMA
  case NVPTX::BI__dmma_m8n8k4_mma_f64:
    return {1, 1, 2, 2, {{MMA_VARIANTS(m8n8k4, f64)}}};

  // Alternate FP MMA
  case NVPTX::BI__mma_bf16_m16n16k16_mma_f32:
    return {4, 4, 8, 8, {{MMA_VARIANTS(m16n16k16, bf16)}}};
  case NVPTX::BI__mma_bf16_m8n32k16_mma_f32:
    return {2, 8, 8, 8, {{MMA_VARIANTS(m8n32k16, bf16)}}};
  case NVPTX::BI__mma_bf16_m32n8k16_mma_f32:
    return {8, 2, 8, 8, {{MMA_VARIANTS(m32n8k16, bf16)}}};
  case NVPTX::BI__mma_tf32_m16n16k8_mma_f32:
    return {4, 4, 8, 8, {{MMA_VARIANTS(m16n16k8, tf32)}}};
319 | default: |
    llvm_unreachable("Unexpected builtin ID.");
321 | } |
322 | #undef MMA_VARIANTS |
323 | #undef MMA_SATF_VARIANTS |
324 | #undef MMA_VARIANTS_I4 |
325 | #undef MMA_VARIANTS_B1_AND |
326 | #undef MMA_VARIANTS_B1_XOR |
327 | } |
328 | |
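// Emits a call to the overloaded ldu intrinsic with the pointee type and the
// pointer's natural alignment. For illustration (mangled name assumed),
// __nvvm_ldu_f(p) becomes roughly:
//   call float @llvm.nvvm.ldu.global.f.f32.p0(ptr %p, i32 4)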
329 | static Value *MakeLdu(unsigned IntrinsicID, CodeGenFunction &CGF, |
330 | const CallExpr *E) { |
  Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
  QualType ArgType = E->getArg(0)->getType();
  clang::CharUnits Align = CGF.CGM.getNaturalPointeeTypeAlignment(ArgType);
  llvm::Type *ElemTy = CGF.ConvertTypeForMem(ArgType->getPointeeType());
  return CGF.Builder.CreateCall(
      CGF.CGM.getIntrinsic(IntrinsicID, {ElemTy, Ptr->getType()}),
      {Ptr, ConstantInt::get(CGF.Builder.getInt32Ty(), Align.getQuantity())});
338 | } |
339 | |
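// Lowers __nvvm_ldg_* to a plain load from the global address space, tagged
// with !invariant.load. A sketch of the IR emitted for __nvvm_ldg_f(p):
//   %g = addrspacecast ptr %p to ptr addrspace(1)
//   %v = load float, ptr addrspace(1) %g, align 4, !invariant.load !0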
340 | static Value *MakeLdg(CodeGenFunction &CGF, const CallExpr *E) { |
  Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
  QualType ArgType = E->getArg(0)->getType();
  clang::CharUnits AlignV = CGF.CGM.getNaturalPointeeTypeAlignment(ArgType);
  llvm::Type *ElemTy = CGF.ConvertTypeForMem(ArgType->getPointeeType());

  // Use addrspace(1) for NVPTX ADDRESS_SPACE_GLOBAL
  auto *ASC = CGF.Builder.CreateAddrSpaceCast(Ptr, CGF.Builder.getPtrTy(1));
  auto *LD = CGF.Builder.CreateAlignedLoad(ElemTy, ASC, AlignV.getAsAlign());
  MDNode *MD = MDNode::get(CGF.Builder.getContext(), {});
  LD->setMetadata(LLVMContext::MD_invariant_load, MD);
351 | |
352 | return LD; |
353 | } |
354 | |
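// Emits a call to a scoped (cta/sys) atomic intrinsic, overloaded on the
// pointee type and pointer type of the first argument.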
355 | static Value *MakeScopedAtomic(unsigned IntrinsicID, CodeGenFunction &CGF, |
356 | const CallExpr *E) { |
  Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
  llvm::Type *ElemTy =
      CGF.ConvertTypeForMem(E->getArg(0)->getType()->getPointeeType());
  return CGF.Builder.CreateCall(
      CGF.CGM.getIntrinsic(IntrinsicID, {ElemTy, Ptr->getType()}),
      {Ptr, CGF.EmitScalarExpr(E->getArg(1))});
363 | } |
364 | |
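// Emits a cp.async intrinsic call. A three-argument builtin call selects the
// _s variant, which takes an explicit src-size operand; the two-argument form
// uses the plain variant, whose copy size is implied by the intrinsic itself.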
365 | static Value *MakeCpAsync(unsigned IntrinsicID, unsigned IntrinsicIDS, |
366 | CodeGenFunction &CGF, const CallExpr *E, |
367 | int SrcSize) { |
  return E->getNumArgs() == 3
             ? CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IntrinsicIDS),
                                      {CGF.EmitScalarExpr(E->getArg(0)),
                                       CGF.EmitScalarExpr(E->getArg(1)),
                                       CGF.EmitScalarExpr(E->getArg(2))})
             : CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IntrinsicID),
                                      {CGF.EmitScalarExpr(E->getArg(0)),
                                       CGF.EmitScalarExpr(E->getArg(1))});
376 | } |
377 | |
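// Shared path for builtins that require native half support: emits an error
// and returns nullptr when the target lacks it, dispatches the ldg/ldu forms
// to the helpers above, and otherwise forwards the call's arguments to the
// given intrinsic, bitcasting where argument and parameter types differ.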
378 | static Value *MakeHalfType(unsigned IntrinsicID, unsigned BuiltinID, |
379 | const CallExpr *E, CodeGenFunction &CGF) { |
380 | auto &C = CGF.CGM.getContext(); |
381 | if (!(C.getLangOpts().NativeHalfType || |
382 | !C.getTargetInfo().useFP16ConversionIntrinsics())) { |
    CGF.CGM.Error(E->getExprLoc(), C.BuiltinInfo.getQuotedName(BuiltinID) +
                                       " requires native half type support.");
    return nullptr;
  }

  if (BuiltinID == NVPTX::BI__nvvm_ldg_h || BuiltinID == NVPTX::BI__nvvm_ldg_h2)
    return MakeLdg(CGF, E);

  if (IntrinsicID == Intrinsic::nvvm_ldu_global_f)
    return MakeLdu(IntrinsicID, CGF, E);

  SmallVector<Value *, 16> Args;
  auto *F = CGF.CGM.getIntrinsic(IntrinsicID);
  auto *FTy = F->getFunctionType();
  unsigned ICEArguments = 0;
  ASTContext::GetBuiltinTypeError Error;
  C.GetBuiltinType(BuiltinID, Error, &ICEArguments);
  assert(Error == ASTContext::GE_None && "Should not codegen an error");
  for (unsigned i = 0, e = E->getNumArgs(); i != e; ++i) {
    assert((ICEArguments & (1 << i)) == 0);
    auto *ArgValue = CGF.EmitScalarExpr(E->getArg(i));
    auto *PTy = FTy->getParamType(i);
    if (PTy != ArgValue->getType())
      ArgValue = CGF.Builder.CreateBitCast(ArgValue, PTy);
    Args.push_back(ArgValue);
  }

  return CGF.Builder.CreateCall(F, Args);
411 | } |
412 | } // namespace |
413 | |
414 | Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, |
415 | const CallExpr *E) { |
416 | switch (BuiltinID) { |
417 | case NVPTX::BI__nvvm_atom_add_gen_i: |
418 | case NVPTX::BI__nvvm_atom_add_gen_l: |
419 | case NVPTX::BI__nvvm_atom_add_gen_ll: |
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Add, E);

  case NVPTX::BI__nvvm_atom_sub_gen_i:
  case NVPTX::BI__nvvm_atom_sub_gen_l:
  case NVPTX::BI__nvvm_atom_sub_gen_ll:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Sub, E);

  case NVPTX::BI__nvvm_atom_and_gen_i:
  case NVPTX::BI__nvvm_atom_and_gen_l:
  case NVPTX::BI__nvvm_atom_and_gen_ll:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::And, E);

  case NVPTX::BI__nvvm_atom_or_gen_i:
  case NVPTX::BI__nvvm_atom_or_gen_l:
  case NVPTX::BI__nvvm_atom_or_gen_ll:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Or, E);

  case NVPTX::BI__nvvm_atom_xor_gen_i:
  case NVPTX::BI__nvvm_atom_xor_gen_l:
  case NVPTX::BI__nvvm_atom_xor_gen_ll:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Xor, E);

  case NVPTX::BI__nvvm_atom_xchg_gen_i:
  case NVPTX::BI__nvvm_atom_xchg_gen_l:
  case NVPTX::BI__nvvm_atom_xchg_gen_ll:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Xchg, E);

  case NVPTX::BI__nvvm_atom_max_gen_i:
  case NVPTX::BI__nvvm_atom_max_gen_l:
  case NVPTX::BI__nvvm_atom_max_gen_ll:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Max, E);

  case NVPTX::BI__nvvm_atom_max_gen_ui:
  case NVPTX::BI__nvvm_atom_max_gen_ul:
  case NVPTX::BI__nvvm_atom_max_gen_ull:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UMax, E);

  case NVPTX::BI__nvvm_atom_min_gen_i:
  case NVPTX::BI__nvvm_atom_min_gen_l:
  case NVPTX::BI__nvvm_atom_min_gen_ll:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Min, E);

  case NVPTX::BI__nvvm_atom_min_gen_ui:
  case NVPTX::BI__nvvm_atom_min_gen_ul:
  case NVPTX::BI__nvvm_atom_min_gen_ull:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UMin, E);

  case NVPTX::BI__nvvm_atom_cas_gen_us:
  case NVPTX::BI__nvvm_atom_cas_gen_i:
  case NVPTX::BI__nvvm_atom_cas_gen_l:
  case NVPTX::BI__nvvm_atom_cas_gen_ll:
    // __nvvm_atom_cas_gen_* should return the old value rather than the
    // success flag.
    return MakeAtomicCmpXchgValue(*this, E, /*ReturnBool=*/false);

  case NVPTX::BI__nvvm_atom_add_gen_f:
  case NVPTX::BI__nvvm_atom_add_gen_d: {
    Address DestAddr = EmitPointerWithAlignment(E->getArg(0));
    Value *Val = EmitScalarExpr(E->getArg(1));

    return Builder.CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, DestAddr, Val,
                                   AtomicOrdering::SequentiallyConsistent);
  }

  case NVPTX::BI__nvvm_atom_inc_gen_ui:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UIncWrap, E);

  case NVPTX::BI__nvvm_atom_dec_gen_ui:
    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UDecWrap, E);
489 | |
490 | case NVPTX::BI__nvvm_ldg_c: |
491 | case NVPTX::BI__nvvm_ldg_sc: |
492 | case NVPTX::BI__nvvm_ldg_c2: |
493 | case NVPTX::BI__nvvm_ldg_sc2: |
494 | case NVPTX::BI__nvvm_ldg_c4: |
495 | case NVPTX::BI__nvvm_ldg_sc4: |
496 | case NVPTX::BI__nvvm_ldg_s: |
497 | case NVPTX::BI__nvvm_ldg_s2: |
498 | case NVPTX::BI__nvvm_ldg_s4: |
499 | case NVPTX::BI__nvvm_ldg_i: |
500 | case NVPTX::BI__nvvm_ldg_i2: |
501 | case NVPTX::BI__nvvm_ldg_i4: |
502 | case NVPTX::BI__nvvm_ldg_l: |
503 | case NVPTX::BI__nvvm_ldg_l2: |
504 | case NVPTX::BI__nvvm_ldg_ll: |
505 | case NVPTX::BI__nvvm_ldg_ll2: |
506 | case NVPTX::BI__nvvm_ldg_uc: |
507 | case NVPTX::BI__nvvm_ldg_uc2: |
508 | case NVPTX::BI__nvvm_ldg_uc4: |
509 | case NVPTX::BI__nvvm_ldg_us: |
510 | case NVPTX::BI__nvvm_ldg_us2: |
511 | case NVPTX::BI__nvvm_ldg_us4: |
512 | case NVPTX::BI__nvvm_ldg_ui: |
513 | case NVPTX::BI__nvvm_ldg_ui2: |
514 | case NVPTX::BI__nvvm_ldg_ui4: |
515 | case NVPTX::BI__nvvm_ldg_ul: |
516 | case NVPTX::BI__nvvm_ldg_ul2: |
517 | case NVPTX::BI__nvvm_ldg_ull: |
518 | case NVPTX::BI__nvvm_ldg_ull2: |
519 | case NVPTX::BI__nvvm_ldg_f: |
520 | case NVPTX::BI__nvvm_ldg_f2: |
521 | case NVPTX::BI__nvvm_ldg_f4: |
522 | case NVPTX::BI__nvvm_ldg_d: |
523 | case NVPTX::BI__nvvm_ldg_d2: |
524 | // PTX Interoperability section 2.2: "For a vector with an even number of |
525 | // elements, its alignment is set to number of elements times the alignment |
526 | // of its member: n*alignof(t)." |
    return MakeLdg(*this, E);
528 | |
529 | case NVPTX::BI__nvvm_ldu_c: |
530 | case NVPTX::BI__nvvm_ldu_sc: |
531 | case NVPTX::BI__nvvm_ldu_c2: |
532 | case NVPTX::BI__nvvm_ldu_sc2: |
533 | case NVPTX::BI__nvvm_ldu_c4: |
534 | case NVPTX::BI__nvvm_ldu_sc4: |
535 | case NVPTX::BI__nvvm_ldu_s: |
536 | case NVPTX::BI__nvvm_ldu_s2: |
537 | case NVPTX::BI__nvvm_ldu_s4: |
538 | case NVPTX::BI__nvvm_ldu_i: |
539 | case NVPTX::BI__nvvm_ldu_i2: |
540 | case NVPTX::BI__nvvm_ldu_i4: |
541 | case NVPTX::BI__nvvm_ldu_l: |
542 | case NVPTX::BI__nvvm_ldu_l2: |
543 | case NVPTX::BI__nvvm_ldu_ll: |
544 | case NVPTX::BI__nvvm_ldu_ll2: |
545 | case NVPTX::BI__nvvm_ldu_uc: |
546 | case NVPTX::BI__nvvm_ldu_uc2: |
547 | case NVPTX::BI__nvvm_ldu_uc4: |
548 | case NVPTX::BI__nvvm_ldu_us: |
549 | case NVPTX::BI__nvvm_ldu_us2: |
550 | case NVPTX::BI__nvvm_ldu_us4: |
551 | case NVPTX::BI__nvvm_ldu_ui: |
552 | case NVPTX::BI__nvvm_ldu_ui2: |
553 | case NVPTX::BI__nvvm_ldu_ui4: |
554 | case NVPTX::BI__nvvm_ldu_ul: |
555 | case NVPTX::BI__nvvm_ldu_ul2: |
556 | case NVPTX::BI__nvvm_ldu_ull: |
557 | case NVPTX::BI__nvvm_ldu_ull2: |
    return MakeLdu(Intrinsic::nvvm_ldu_global_i, *this, E);
  case NVPTX::BI__nvvm_ldu_f:
  case NVPTX::BI__nvvm_ldu_f2:
  case NVPTX::BI__nvvm_ldu_f4:
  case NVPTX::BI__nvvm_ldu_d:
  case NVPTX::BI__nvvm_ldu_d2:
    return MakeLdu(Intrinsic::nvvm_ldu_global_f, *this, E);
565 | |
566 | case NVPTX::BI__nvvm_atom_cta_add_gen_i: |
567 | case NVPTX::BI__nvvm_atom_cta_add_gen_l: |
568 | case NVPTX::BI__nvvm_atom_cta_add_gen_ll: |
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_i_cta, *this, E);
  case NVPTX::BI__nvvm_atom_sys_add_gen_i:
  case NVPTX::BI__nvvm_atom_sys_add_gen_l:
  case NVPTX::BI__nvvm_atom_sys_add_gen_ll:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_i_sys, *this, E);
  case NVPTX::BI__nvvm_atom_cta_add_gen_f:
  case NVPTX::BI__nvvm_atom_cta_add_gen_d:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_f_cta, *this, E);
  case NVPTX::BI__nvvm_atom_sys_add_gen_f:
  case NVPTX::BI__nvvm_atom_sys_add_gen_d:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_f_sys, *this, E);
  case NVPTX::BI__nvvm_atom_cta_xchg_gen_i:
  case NVPTX::BI__nvvm_atom_cta_xchg_gen_l:
  case NVPTX::BI__nvvm_atom_cta_xchg_gen_ll:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_exch_gen_i_cta, *this, E);
  case NVPTX::BI__nvvm_atom_sys_xchg_gen_i:
  case NVPTX::BI__nvvm_atom_sys_xchg_gen_l:
  case NVPTX::BI__nvvm_atom_sys_xchg_gen_ll:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_exch_gen_i_sys, *this, E);
  case NVPTX::BI__nvvm_atom_cta_max_gen_i:
  case NVPTX::BI__nvvm_atom_cta_max_gen_ui:
  case NVPTX::BI__nvvm_atom_cta_max_gen_l:
  case NVPTX::BI__nvvm_atom_cta_max_gen_ul:
  case NVPTX::BI__nvvm_atom_cta_max_gen_ll:
  case NVPTX::BI__nvvm_atom_cta_max_gen_ull:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_max_gen_i_cta, *this, E);
  case NVPTX::BI__nvvm_atom_sys_max_gen_i:
  case NVPTX::BI__nvvm_atom_sys_max_gen_ui:
  case NVPTX::BI__nvvm_atom_sys_max_gen_l:
  case NVPTX::BI__nvvm_atom_sys_max_gen_ul:
  case NVPTX::BI__nvvm_atom_sys_max_gen_ll:
  case NVPTX::BI__nvvm_atom_sys_max_gen_ull:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_max_gen_i_sys, *this, E);
  case NVPTX::BI__nvvm_atom_cta_min_gen_i:
  case NVPTX::BI__nvvm_atom_cta_min_gen_ui:
  case NVPTX::BI__nvvm_atom_cta_min_gen_l:
  case NVPTX::BI__nvvm_atom_cta_min_gen_ul:
  case NVPTX::BI__nvvm_atom_cta_min_gen_ll:
  case NVPTX::BI__nvvm_atom_cta_min_gen_ull:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_min_gen_i_cta, *this, E);
  case NVPTX::BI__nvvm_atom_sys_min_gen_i:
  case NVPTX::BI__nvvm_atom_sys_min_gen_ui:
  case NVPTX::BI__nvvm_atom_sys_min_gen_l:
  case NVPTX::BI__nvvm_atom_sys_min_gen_ul:
  case NVPTX::BI__nvvm_atom_sys_min_gen_ll:
  case NVPTX::BI__nvvm_atom_sys_min_gen_ull:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_min_gen_i_sys, *this, E);
  case NVPTX::BI__nvvm_atom_cta_inc_gen_ui:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_inc_gen_i_cta, *this, E);
  case NVPTX::BI__nvvm_atom_cta_dec_gen_ui:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_dec_gen_i_cta, *this, E);
  case NVPTX::BI__nvvm_atom_sys_inc_gen_ui:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_inc_gen_i_sys, *this, E);
  case NVPTX::BI__nvvm_atom_sys_dec_gen_ui:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_dec_gen_i_sys, *this, E);
  case NVPTX::BI__nvvm_atom_cta_and_gen_i:
  case NVPTX::BI__nvvm_atom_cta_and_gen_l:
  case NVPTX::BI__nvvm_atom_cta_and_gen_ll:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_and_gen_i_cta, *this, E);
  case NVPTX::BI__nvvm_atom_sys_and_gen_i:
  case NVPTX::BI__nvvm_atom_sys_and_gen_l:
  case NVPTX::BI__nvvm_atom_sys_and_gen_ll:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_and_gen_i_sys, *this, E);
  case NVPTX::BI__nvvm_atom_cta_or_gen_i:
  case NVPTX::BI__nvvm_atom_cta_or_gen_l:
  case NVPTX::BI__nvvm_atom_cta_or_gen_ll:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_or_gen_i_cta, *this, E);
  case NVPTX::BI__nvvm_atom_sys_or_gen_i:
  case NVPTX::BI__nvvm_atom_sys_or_gen_l:
  case NVPTX::BI__nvvm_atom_sys_or_gen_ll:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_or_gen_i_sys, *this, E);
  case NVPTX::BI__nvvm_atom_cta_xor_gen_i:
  case NVPTX::BI__nvvm_atom_cta_xor_gen_l:
  case NVPTX::BI__nvvm_atom_cta_xor_gen_ll:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_xor_gen_i_cta, *this, E);
  case NVPTX::BI__nvvm_atom_sys_xor_gen_i:
  case NVPTX::BI__nvvm_atom_sys_xor_gen_l:
  case NVPTX::BI__nvvm_atom_sys_xor_gen_ll:
    return MakeScopedAtomic(Intrinsic::nvvm_atomic_xor_gen_i_sys, *this, E);
648 | case NVPTX::BI__nvvm_atom_cta_cas_gen_us: |
649 | case NVPTX::BI__nvvm_atom_cta_cas_gen_i: |
650 | case NVPTX::BI__nvvm_atom_cta_cas_gen_l: |
651 | case NVPTX::BI__nvvm_atom_cta_cas_gen_ll: { |
    Value *Ptr = EmitScalarExpr(E->getArg(0));
    llvm::Type *ElemTy =
        ConvertTypeForMem(E->getArg(0)->getType()->getPointeeType());
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_atomic_cas_gen_i_cta,
                         {ElemTy, Ptr->getType()}),
        {Ptr, EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2))});
659 | } |
660 | case NVPTX::BI__nvvm_atom_sys_cas_gen_us: |
661 | case NVPTX::BI__nvvm_atom_sys_cas_gen_i: |
662 | case NVPTX::BI__nvvm_atom_sys_cas_gen_l: |
663 | case NVPTX::BI__nvvm_atom_sys_cas_gen_ll: { |
    Value *Ptr = EmitScalarExpr(E->getArg(0));
    llvm::Type *ElemTy =
        ConvertTypeForMem(E->getArg(0)->getType()->getPointeeType());
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_atomic_cas_gen_i_sys,
                         {ElemTy, Ptr->getType()}),
        {Ptr, EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2))});
671 | } |
672 | case NVPTX::BI__nvvm_match_all_sync_i32p: |
673 | case NVPTX::BI__nvvm_match_all_sync_i64p: { |
    Value *Mask = EmitScalarExpr(E->getArg(0));
    Value *Val = EmitScalarExpr(E->getArg(1));
    Address PredOutPtr = EmitPointerWithAlignment(E->getArg(2));
    Value *ResultPair = Builder.CreateCall(
        CGM.getIntrinsic(BuiltinID == NVPTX::BI__nvvm_match_all_sync_i32p
                             ? Intrinsic::nvvm_match_all_sync_i32p
                             : Intrinsic::nvvm_match_all_sync_i64p),
        {Mask, Val});
    Value *Pred = Builder.CreateZExt(Builder.CreateExtractValue(ResultPair, 1),
                                     PredOutPtr.getElementType());
    Builder.CreateStore(Pred, PredOutPtr);
    return Builder.CreateExtractValue(ResultPair, 0);
686 | } |
687 | |
688 | // FP MMA loads |
689 | case NVPTX::BI__hmma_m16n16k16_ld_a: |
690 | case NVPTX::BI__hmma_m16n16k16_ld_b: |
691 | case NVPTX::BI__hmma_m16n16k16_ld_c_f16: |
692 | case NVPTX::BI__hmma_m16n16k16_ld_c_f32: |
693 | case NVPTX::BI__hmma_m32n8k16_ld_a: |
694 | case NVPTX::BI__hmma_m32n8k16_ld_b: |
695 | case NVPTX::BI__hmma_m32n8k16_ld_c_f16: |
696 | case NVPTX::BI__hmma_m32n8k16_ld_c_f32: |
697 | case NVPTX::BI__hmma_m8n32k16_ld_a: |
698 | case NVPTX::BI__hmma_m8n32k16_ld_b: |
699 | case NVPTX::BI__hmma_m8n32k16_ld_c_f16: |
700 | case NVPTX::BI__hmma_m8n32k16_ld_c_f32: |
701 | // Integer MMA loads. |
702 | case NVPTX::BI__imma_m16n16k16_ld_a_s8: |
703 | case NVPTX::BI__imma_m16n16k16_ld_a_u8: |
704 | case NVPTX::BI__imma_m16n16k16_ld_b_s8: |
705 | case NVPTX::BI__imma_m16n16k16_ld_b_u8: |
706 | case NVPTX::BI__imma_m16n16k16_ld_c: |
707 | case NVPTX::BI__imma_m32n8k16_ld_a_s8: |
708 | case NVPTX::BI__imma_m32n8k16_ld_a_u8: |
709 | case NVPTX::BI__imma_m32n8k16_ld_b_s8: |
710 | case NVPTX::BI__imma_m32n8k16_ld_b_u8: |
711 | case NVPTX::BI__imma_m32n8k16_ld_c: |
712 | case NVPTX::BI__imma_m8n32k16_ld_a_s8: |
713 | case NVPTX::BI__imma_m8n32k16_ld_a_u8: |
714 | case NVPTX::BI__imma_m8n32k16_ld_b_s8: |
715 | case NVPTX::BI__imma_m8n32k16_ld_b_u8: |
716 | case NVPTX::BI__imma_m8n32k16_ld_c: |
717 | // Sub-integer MMA loads. |
718 | case NVPTX::BI__imma_m8n8k32_ld_a_s4: |
719 | case NVPTX::BI__imma_m8n8k32_ld_a_u4: |
720 | case NVPTX::BI__imma_m8n8k32_ld_b_s4: |
721 | case NVPTX::BI__imma_m8n8k32_ld_b_u4: |
722 | case NVPTX::BI__imma_m8n8k32_ld_c: |
723 | case NVPTX::BI__bmma_m8n8k128_ld_a_b1: |
724 | case NVPTX::BI__bmma_m8n8k128_ld_b_b1: |
725 | case NVPTX::BI__bmma_m8n8k128_ld_c: |
726 | // Double MMA loads. |
727 | case NVPTX::BI__dmma_m8n8k4_ld_a: |
728 | case NVPTX::BI__dmma_m8n8k4_ld_b: |
729 | case NVPTX::BI__dmma_m8n8k4_ld_c: |
730 | // Alternate float MMA loads. |
731 | case NVPTX::BI__mma_bf16_m16n16k16_ld_a: |
732 | case NVPTX::BI__mma_bf16_m16n16k16_ld_b: |
733 | case NVPTX::BI__mma_bf16_m8n32k16_ld_a: |
734 | case NVPTX::BI__mma_bf16_m8n32k16_ld_b: |
735 | case NVPTX::BI__mma_bf16_m32n8k16_ld_a: |
736 | case NVPTX::BI__mma_bf16_m32n8k16_ld_b: |
737 | case NVPTX::BI__mma_tf32_m16n16k8_ld_a: |
738 | case NVPTX::BI__mma_tf32_m16n16k8_ld_b: |
739 | case NVPTX::BI__mma_tf32_m16n16k8_ld_c: { |
    Address Dst = EmitPointerWithAlignment(E->getArg(0));
    Value *Src = EmitScalarExpr(E->getArg(1));
    Value *Ldm = EmitScalarExpr(E->getArg(2));
    std::optional<llvm::APSInt> isColMajorArg =
        E->getArg(3)->getIntegerConstantExpr(getContext());
    if (!isColMajorArg)
      return nullptr;
    bool isColMajor = isColMajorArg->getSExtValue();
    NVPTXMmaLdstInfo II = getNVPTXMmaLdstInfo(BuiltinID);
    unsigned IID = isColMajor ? II.IID_col : II.IID_row;
    if (IID == 0)
      return nullptr;

    Value *Result =
        Builder.CreateCall(CGM.getIntrinsic(IID, Src->getType()), {Src, Ldm});

    // Save returned values.
    assert(II.NumResults);
    if (II.NumResults == 1) {
      Builder.CreateAlignedStore(Result, Dst.emitRawPointer(*this),
                                 CharUnits::fromQuantity(4));
    } else {
      for (unsigned i = 0; i < II.NumResults; ++i) {
        Builder.CreateAlignedStore(
            Builder.CreateBitCast(Builder.CreateExtractValue(Result, i),
                                  Dst.getElementType()),
            Builder.CreateGEP(Dst.getElementType(), Dst.emitRawPointer(*this),
                              llvm::ConstantInt::get(IntTy, i)),
            CharUnits::fromQuantity(4));
      }
    }
    return Result;
772 | } |
773 | |
774 | case NVPTX::BI__hmma_m16n16k16_st_c_f16: |
775 | case NVPTX::BI__hmma_m16n16k16_st_c_f32: |
776 | case NVPTX::BI__hmma_m32n8k16_st_c_f16: |
777 | case NVPTX::BI__hmma_m32n8k16_st_c_f32: |
778 | case NVPTX::BI__hmma_m8n32k16_st_c_f16: |
779 | case NVPTX::BI__hmma_m8n32k16_st_c_f32: |
780 | case NVPTX::BI__imma_m16n16k16_st_c_i32: |
781 | case NVPTX::BI__imma_m32n8k16_st_c_i32: |
782 | case NVPTX::BI__imma_m8n32k16_st_c_i32: |
783 | case NVPTX::BI__imma_m8n8k32_st_c_i32: |
784 | case NVPTX::BI__bmma_m8n8k128_st_c_i32: |
785 | case NVPTX::BI__dmma_m8n8k4_st_c_f64: |
786 | case NVPTX::BI__mma_m16n16k8_st_c_f32: { |
    Value *Dst = EmitScalarExpr(E->getArg(0));
    Address Src = EmitPointerWithAlignment(E->getArg(1));
    Value *Ldm = EmitScalarExpr(E->getArg(2));
    std::optional<llvm::APSInt> isColMajorArg =
        E->getArg(3)->getIntegerConstantExpr(getContext());
    if (!isColMajorArg)
      return nullptr;
    bool isColMajor = isColMajorArg->getSExtValue();
    NVPTXMmaLdstInfo II = getNVPTXMmaLdstInfo(BuiltinID);
    unsigned IID = isColMajor ? II.IID_col : II.IID_row;
    if (IID == 0)
      return nullptr;
    Function *Intrinsic = CGM.getIntrinsic(IID, Dst->getType());
    llvm::Type *ParamType = Intrinsic->getFunctionType()->getParamType(1);
    SmallVector<Value *, 10> Values = {Dst};
    for (unsigned i = 0; i < II.NumResults; ++i) {
      Value *V = Builder.CreateAlignedLoad(
          Src.getElementType(),
          Builder.CreateGEP(Src.getElementType(), Src.emitRawPointer(*this),
                            llvm::ConstantInt::get(IntTy, i)),
          CharUnits::fromQuantity(4));
      Values.push_back(Builder.CreateBitCast(V, ParamType));
    }
    Values.push_back(Ldm);
    Value *Result = Builder.CreateCall(Intrinsic, Values);
813 | return Result; |
814 | } |
815 | |
816 | // BI__hmma_m16n16k16_mma_<Dtype><CType>(d, a, b, c, layout, satf) --> |
817 | // Intrinsic::nvvm_wmma_m16n16k16_mma_sync<layout A,B><DType><CType><Satf> |
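  // For example (derived from the tables above), __hmma_m16n16k16_mma_f32f16
  // with layout == 1 and satf == 0 selects
  // Intrinsic::nvvm_wmma_m16n16k16_mma_row_col_f32_f16.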
818 | case NVPTX::BI__hmma_m16n16k16_mma_f16f16: |
819 | case NVPTX::BI__hmma_m16n16k16_mma_f32f16: |
820 | case NVPTX::BI__hmma_m16n16k16_mma_f32f32: |
821 | case NVPTX::BI__hmma_m16n16k16_mma_f16f32: |
822 | case NVPTX::BI__hmma_m32n8k16_mma_f16f16: |
823 | case NVPTX::BI__hmma_m32n8k16_mma_f32f16: |
824 | case NVPTX::BI__hmma_m32n8k16_mma_f32f32: |
825 | case NVPTX::BI__hmma_m32n8k16_mma_f16f32: |
826 | case NVPTX::BI__hmma_m8n32k16_mma_f16f16: |
827 | case NVPTX::BI__hmma_m8n32k16_mma_f32f16: |
828 | case NVPTX::BI__hmma_m8n32k16_mma_f32f32: |
829 | case NVPTX::BI__hmma_m8n32k16_mma_f16f32: |
830 | case NVPTX::BI__imma_m16n16k16_mma_s8: |
831 | case NVPTX::BI__imma_m16n16k16_mma_u8: |
832 | case NVPTX::BI__imma_m32n8k16_mma_s8: |
833 | case NVPTX::BI__imma_m32n8k16_mma_u8: |
834 | case NVPTX::BI__imma_m8n32k16_mma_s8: |
835 | case NVPTX::BI__imma_m8n32k16_mma_u8: |
836 | case NVPTX::BI__imma_m8n8k32_mma_s4: |
837 | case NVPTX::BI__imma_m8n8k32_mma_u4: |
838 | case NVPTX::BI__bmma_m8n8k128_mma_xor_popc_b1: |
839 | case NVPTX::BI__bmma_m8n8k128_mma_and_popc_b1: |
840 | case NVPTX::BI__dmma_m8n8k4_mma_f64: |
841 | case NVPTX::BI__mma_bf16_m16n16k16_mma_f32: |
842 | case NVPTX::BI__mma_bf16_m8n32k16_mma_f32: |
843 | case NVPTX::BI__mma_bf16_m32n8k16_mma_f32: |
844 | case NVPTX::BI__mma_tf32_m16n16k8_mma_f32: { |
    Address Dst = EmitPointerWithAlignment(E->getArg(0));
    Address SrcA = EmitPointerWithAlignment(E->getArg(1));
    Address SrcB = EmitPointerWithAlignment(E->getArg(2));
    Address SrcC = EmitPointerWithAlignment(E->getArg(3));
    std::optional<llvm::APSInt> LayoutArg =
        E->getArg(4)->getIntegerConstantExpr(getContext());
    if (!LayoutArg)
      return nullptr;
    int Layout = LayoutArg->getSExtValue();
    if (Layout < 0 || Layout > 3)
      return nullptr;
    llvm::APSInt SatfArg;
    if (BuiltinID == NVPTX::BI__bmma_m8n8k128_mma_xor_popc_b1 ||
        BuiltinID == NVPTX::BI__bmma_m8n8k128_mma_and_popc_b1)
      SatfArg = 0; // .b1 does not have satf argument.
    else if (std::optional<llvm::APSInt> OptSatfArg =
                 E->getArg(5)->getIntegerConstantExpr(getContext()))
      SatfArg = *OptSatfArg;
    else
      return nullptr;
    bool Satf = SatfArg.getSExtValue();
    NVPTXMmaInfo MI = getNVPTXMmaInfo(BuiltinID);
    unsigned IID = MI.getMMAIntrinsic(Layout, Satf);
    if (IID == 0) // Unsupported combination of Layout/Satf.
      return nullptr;

    SmallVector<Value *, 24> Values;
    Function *Intrinsic = CGM.getIntrinsic(IID);
    llvm::Type *AType = Intrinsic->getFunctionType()->getParamType(0);
    // Load A
    for (unsigned i = 0; i < MI.NumEltsA; ++i) {
      Value *V = Builder.CreateAlignedLoad(
          SrcA.getElementType(),
          Builder.CreateGEP(SrcA.getElementType(), SrcA.emitRawPointer(*this),
                            llvm::ConstantInt::get(IntTy, i)),
          CharUnits::fromQuantity(4));
      Values.push_back(Builder.CreateBitCast(V, AType));
    }
    // Load B
    llvm::Type *BType = Intrinsic->getFunctionType()->getParamType(MI.NumEltsA);
    for (unsigned i = 0; i < MI.NumEltsB; ++i) {
      Value *V = Builder.CreateAlignedLoad(
          SrcB.getElementType(),
          Builder.CreateGEP(SrcB.getElementType(), SrcB.emitRawPointer(*this),
                            llvm::ConstantInt::get(IntTy, i)),
          CharUnits::fromQuantity(4));
      Values.push_back(Builder.CreateBitCast(V, BType));
    }
    // Load C
    llvm::Type *CType =
        Intrinsic->getFunctionType()->getParamType(MI.NumEltsA + MI.NumEltsB);
    for (unsigned i = 0; i < MI.NumEltsC; ++i) {
      Value *V = Builder.CreateAlignedLoad(
          SrcC.getElementType(),
          Builder.CreateGEP(SrcC.getElementType(), SrcC.emitRawPointer(*this),
                            llvm::ConstantInt::get(IntTy, i)),
          CharUnits::fromQuantity(4));
      Values.push_back(Builder.CreateBitCast(V, CType));
    }
    Value *Result = Builder.CreateCall(Intrinsic, Values);
    llvm::Type *DType = Dst.getElementType();
    for (unsigned i = 0; i < MI.NumEltsD; ++i)
      Builder.CreateAlignedStore(
          Builder.CreateBitCast(Builder.CreateExtractValue(Result, i), DType),
          Builder.CreateGEP(Dst.getElementType(), Dst.emitRawPointer(*this),
                            llvm::ConstantInt::get(IntTy, i)),
          CharUnits::fromQuantity(4));
912 | return Result; |
913 | } |
  // The following builtins require half type support.
915 | case NVPTX::BI__nvvm_ex2_approx_f16: |
    return MakeHalfType(Intrinsic::nvvm_ex2_approx_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_ex2_approx_f16x2:
    return MakeHalfType(Intrinsic::nvvm_ex2_approx_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_ff2f16x2_rn:
    return MakeHalfType(Intrinsic::nvvm_ff2f16x2_rn, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_ff2f16x2_rn_relu:
    return MakeHalfType(Intrinsic::nvvm_ff2f16x2_rn_relu, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_ff2f16x2_rz:
    return MakeHalfType(Intrinsic::nvvm_ff2f16x2_rz, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_ff2f16x2_rz_relu:
    return MakeHalfType(Intrinsic::nvvm_ff2f16x2_rz_relu, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_f16:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_ftz_f16:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_ftz_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_ftz_relu_f16:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_relu_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fma_rn_ftz_relu_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_relu_f16x2, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fma_rn_ftz_sat_f16:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_sat_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fma_rn_ftz_sat_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_sat_f16x2, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fma_rn_relu_f16:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_relu_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_relu_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_relu_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_sat_f16:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_sat_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fma_rn_sat_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fma_rn_sat_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_ftz_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_ftz_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_ftz_nan_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_nan_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_ftz_nan_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_nan_f16x2, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmax_ftz_nan_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_nan_xorsign_abs_f16, BuiltinID,
                        E, *this);
  case NVPTX::BI__nvvm_fmax_ftz_nan_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_nan_xorsign_abs_f16x2,
                        BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_ftz_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_xorsign_abs_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmax_ftz_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_xorsign_abs_f16x2, BuiltinID,
                        E, *this);
  case NVPTX::BI__nvvm_fmax_nan_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_nan_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_nan_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_nan_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmax_nan_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_nan_xorsign_abs_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmax_nan_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_nan_xorsign_abs_f16x2, BuiltinID,
                        E, *this);
  case NVPTX::BI__nvvm_fmax_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmax_xorsign_abs_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmax_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmax_xorsign_abs_f16x2, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmin_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_ftz_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_ftz_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_ftz_nan_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_nan_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_ftz_nan_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_nan_f16x2, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmin_ftz_nan_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_nan_xorsign_abs_f16, BuiltinID,
                        E, *this);
  case NVPTX::BI__nvvm_fmin_ftz_nan_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_nan_xorsign_abs_f16x2,
                        BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_ftz_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_xorsign_abs_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmin_ftz_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_xorsign_abs_f16x2, BuiltinID,
                        E, *this);
  case NVPTX::BI__nvvm_fmin_nan_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_nan_f16, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_nan_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_nan_f16x2, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_fmin_nan_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_nan_xorsign_abs_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmin_nan_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_nan_xorsign_abs_f16x2, BuiltinID,
                        E, *this);
  case NVPTX::BI__nvvm_fmin_xorsign_abs_f16:
    return MakeHalfType(Intrinsic::nvvm_fmin_xorsign_abs_f16, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fmin_xorsign_abs_f16x2:
    return MakeHalfType(Intrinsic::nvvm_fmin_xorsign_abs_f16x2, BuiltinID, E,
                        *this);
  case NVPTX::BI__nvvm_fabs_f:
  case NVPTX::BI__nvvm_abs_bf16:
  case NVPTX::BI__nvvm_abs_bf16x2:
  case NVPTX::BI__nvvm_fabs_f16:
  case NVPTX::BI__nvvm_fabs_f16x2:
    return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_fabs,
                                        EmitScalarExpr(E->getArg(0)));
  case NVPTX::BI__nvvm_fabs_ftz_f:
  case NVPTX::BI__nvvm_fabs_ftz_f16:
  case NVPTX::BI__nvvm_fabs_ftz_f16x2:
    return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_fabs_ftz,
                                        EmitScalarExpr(E->getArg(0)));
  case NVPTX::BI__nvvm_fabs_d:
    return Builder.CreateUnaryIntrinsic(Intrinsic::fabs,
                                        EmitScalarExpr(E->getArg(0)));
  case NVPTX::BI__nvvm_ldg_h:
  case NVPTX::BI__nvvm_ldg_h2:
    return MakeHalfType(Intrinsic::not_intrinsic, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_ldu_h:
  case NVPTX::BI__nvvm_ldu_h2:
    return MakeHalfType(Intrinsic::nvvm_ldu_global_f, BuiltinID, E, *this);
1058 | case NVPTX::BI__nvvm_cp_async_ca_shared_global_4: |
    return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_4,
                       Intrinsic::nvvm_cp_async_ca_shared_global_4_s, *this, E,
                       4);
  case NVPTX::BI__nvvm_cp_async_ca_shared_global_8:
    return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_8,
                       Intrinsic::nvvm_cp_async_ca_shared_global_8_s, *this, E,
                       8);
  case NVPTX::BI__nvvm_cp_async_ca_shared_global_16:
    return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_16,
                       Intrinsic::nvvm_cp_async_ca_shared_global_16_s, *this,
                       E, 16);
  case NVPTX::BI__nvvm_cp_async_cg_shared_global_16:
    return MakeCpAsync(Intrinsic::nvvm_cp_async_cg_shared_global_16,
                       Intrinsic::nvvm_cp_async_cg_shared_global_16_s, *this,
                       E, 16);
1074 | case NVPTX::BI__nvvm_read_ptx_sreg_clusterid_x: |
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_clusterid_x));
  case NVPTX::BI__nvvm_read_ptx_sreg_clusterid_y:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_clusterid_y));
  case NVPTX::BI__nvvm_read_ptx_sreg_clusterid_z:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_clusterid_z));
  case NVPTX::BI__nvvm_read_ptx_sreg_clusterid_w:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_clusterid_w));
  case NVPTX::BI__nvvm_read_ptx_sreg_nclusterid_x:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_nclusterid_x));
  case NVPTX::BI__nvvm_read_ptx_sreg_nclusterid_y:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_nclusterid_y));
  case NVPTX::BI__nvvm_read_ptx_sreg_nclusterid_z:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_nclusterid_z));
  case NVPTX::BI__nvvm_read_ptx_sreg_nclusterid_w:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_nclusterid_w));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_ctaid_x:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_x));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_ctaid_y:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_y));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_ctaid_z:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_z));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_ctaid_w:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_w));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_nctaid_x:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_x));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_nctaid_y:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_y));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_nctaid_z:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_z));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_nctaid_w:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_w));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_ctarank:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_ctarank));
  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_nctarank:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_nctarank));
  case NVPTX::BI__nvvm_is_explicit_cluster:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_is_explicit_cluster));
  case NVPTX::BI__nvvm_isspacep_shared_cluster:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_isspacep_shared_cluster),
        EmitScalarExpr(E->getArg(0)));
  case NVPTX::BI__nvvm_mapa:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_mapa),
        {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1))});
  case NVPTX::BI__nvvm_mapa_shared_cluster:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_mapa_shared_cluster),
        {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1))});
  case NVPTX::BI__nvvm_getctarank:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_getctarank),
        EmitScalarExpr(E->getArg(0)));
  case NVPTX::BI__nvvm_getctarank_shared_cluster:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_getctarank_shared_cluster),
        EmitScalarExpr(E->getArg(0)));
  case NVPTX::BI__nvvm_barrier_cluster_arrive:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cluster_arrive));
  case NVPTX::BI__nvvm_barrier_cluster_arrive_relaxed:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cluster_arrive_relaxed));
  case NVPTX::BI__nvvm_barrier_cluster_wait:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cluster_wait));
  case NVPTX::BI__nvvm_fence_sc_cluster:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_fence_sc_cluster));
  case NVPTX::BI__nvvm_bar_sync:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync_aligned_all),
        EmitScalarExpr(E->getArg(0)));
  case NVPTX::BI__syncthreads:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync_aligned_all),
        Builder.getInt32(0));
  case NVPTX::BI__nvvm_barrier_sync:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync_all),
        EmitScalarExpr(E->getArg(0)));
  case NVPTX::BI__nvvm_barrier_sync_cnt:
    return Builder.CreateCall(
        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync_count),
        {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1))});
1179 | default: |
1180 | return nullptr; |
1181 | } |
1182 | } |
1183 | |