1//=====-- NVPTXSubtarget.h - Define Subtarget for the NVPTX ---*- C++ -*--====//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file declares the NVPTX specific subclass of TargetSubtarget.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXSUBTARGET_H
14#define LLVM_LIB_TARGET_NVPTX_NVPTXSUBTARGET_H
15
16#include "NVPTX.h"
17#include "NVPTXFrameLowering.h"
18#include "NVPTXISelLowering.h"
19#include "NVPTXInstrInfo.h"
20#include "NVPTXRegisterInfo.h"
21#include "llvm/CodeGen/TargetSubtargetInfo.h"
22#include "llvm/IR/DataLayout.h"
23#include "llvm/IR/NVVMIntrinsicUtils.h"
24#include "llvm/Support/NVPTXAddrSpace.h"
25#include <string>
26
27#define GET_SUBTARGETINFO_HEADER
28#include "NVPTXGenSubtargetInfo.inc"
29
30namespace llvm {
31
32// FullSmVersion encoding: SM * 10 + ArchSuffixOffset
33// ArchSuffixOffset: 0 (base), 2 ('f'), 3 ('a')
34// e.g. sm_100 -> 1000, sm_100f -> 1002, sm_100a -> 1003
35
36class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
37 virtual void anchor();
38 std::string TargetName;
39
40 // PTX version x.y is represented as 10*x+y, e.g. 3.1 == 31
41 unsigned PTXVersion;
42
43 // FullSmVersion encoding: SM * 10 + ArchSuffixOffset
44 // ArchSuffixOffset: 0 (base), 2 ('f'), 3 ('a')
45 // e.g. sm_30 -> 300, sm_90a -> 903, sm_100f -> 1002
46 unsigned int FullSmVersion;
47
48 // SM version x.y is represented as 10*x+y, e.g. 3.1 == 31. Derived from
49 // FullSmVersion.
50 unsigned int SmVersion;
51
52 NVPTXInstrInfo InstrInfo;
53 NVPTXTargetLowering TLInfo;
54 std::unique_ptr<const SelectionDAGTargetInfo> TSInfo;
55
56 // NVPTX does not have any call stack frame, but need a NVPTX specific
57 // FrameLowering class because TargetFrameLowering is abstract.
58 NVPTXFrameLowering FrameLowering;
59
60public:
61 /// This constructor initializes the data members to match that
62 /// of the specified module.
63 ///
64 NVPTXSubtarget(const Triple &TT, const std::string &CPU,
65 const std::string &FS, const NVPTXTargetMachine &TM);
66
67 ~NVPTXSubtarget() override;
68
69 const TargetFrameLowering *getFrameLowering() const override {
70 return &FrameLowering;
71 }
72 const NVPTXInstrInfo *getInstrInfo() const override { return &InstrInfo; }
73 const NVPTXRegisterInfo *getRegisterInfo() const override {
74 return &InstrInfo.getRegisterInfo();
75 }
76 const NVPTXTargetLowering *getTargetLowering() const override {
77 return &TLInfo;
78 }
79
80 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override;
81
82 // Checks PTX version and family-specific and architecture-specific SM
83 // versions. For example, sm_100{f/a} and any future variants in the same
84 // family will match for any PTX version greater than or equal to
85 // `PTXVersion`.
86 bool hasPTXWithFamilySMs(unsigned PTXVersion,
87 ArrayRef<unsigned> SMVersions) const;
88 // Checks PTX version and architecture-specific SM versions.
89 // For example, sm_100{a} will match for any PTX version greater than or equal
90 // to `PTXVersion`.
91 bool hasPTXWithAccelSMs(unsigned PTXVersion,
92 ArrayRef<unsigned> SMVersions) const;
93
94 bool has256BitVectorLoadStore(unsigned AS) const {
95 return SmVersion >= 100 && PTXVersion >= 88 &&
96 AS == NVPTXAS::ADDRESS_SPACE_GLOBAL;
97 }
98 bool hasUsedBytesMaskPragma() const {
99 return SmVersion >= 50 && PTXVersion >= 83;
100 }
101 bool hasAtomAddF64() const { return SmVersion >= 60; }
102 bool hasAtomScope() const { return SmVersion >= 60; }
103 bool hasAtomBitwise64() const { return SmVersion >= 32; }
104 bool hasAtomMinMax64() const { return SmVersion >= 32; }
105 bool hasAtomCas16() const { return SmVersion >= 70 && PTXVersion >= 63; }
106 bool hasAtomSwap128() const { return SmVersion >= 90 && PTXVersion >= 83; }
107 bool hasClusters() const { return SmVersion >= 90 && PTXVersion >= 78; }
108 bool hasLDG() const { return SmVersion >= 32; }
109 bool hasHWROT32() const { return SmVersion >= 32; }
110 bool hasBrx() const { return SmVersion >= 30 && PTXVersion >= 60; }
111 bool hasFP16Math() const { return SmVersion >= 53; }
112 bool hasBF16Math() const { return SmVersion >= 80; }
113 bool allowFP16Math() const;
114 bool hasMaskOperator() const { return PTXVersion >= 71; }
115 bool hasNoReturn() const { return SmVersion >= 30 && PTXVersion >= 64; }
116 // Does SM & PTX support memory orderings (weak and atomic: relaxed, acquire,
117 // release, acq_rel, sc) ?
118 bool hasMemoryOrdering() const { return SmVersion >= 70 && PTXVersion >= 60; }
119 // Does SM & PTX support .acquire and .release qualifiers for fence?
120 bool hasSplitAcquireAndReleaseFences() const {
121 return SmVersion >= 90 && PTXVersion >= 86;
122 }
123 // Does SM & PTX support atomic relaxed MMIO operations ?
124 bool hasRelaxedMMIO() const { return SmVersion >= 70 && PTXVersion >= 82; }
125 bool hasDotInstructions() const {
126 return SmVersion >= 61 && PTXVersion >= 50;
127 }
128
129 // Checks following instructions support:
130 // - tcgen05.ld/st
131 // - tcgen05.alloc/dealloc/relinquish
132 // - tcgen05.cp
133 // - tcgen05.fence/wait
134 // - tcgen05.commit
135 // - tcgen05.mma
136 bool hasTcgen05InstSupport() const {
137 // sm_101 renamed to sm_110 in PTX 9.0
138 return hasPTXWithFamilySMs(PTXVersion: 90, SMVersions: {100, 110}) ||
139 hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {100, 101}) ||
140 hasPTXWithAccelSMs(PTXVersion: 86, SMVersions: {100, 101});
141 }
142
143 // Checks tcgen05.shift instruction support.
144 bool hasTcgen05ShiftSupport() const {
145 // sm_101 renamed to sm_110 in PTX 9.0
146 return hasPTXWithAccelSMs(PTXVersion: 90, SMVersions: {100, 110, 103}) ||
147 hasPTXWithAccelSMs(PTXVersion: 88, SMVersions: {100, 101, 103}) ||
148 hasPTXWithAccelSMs(PTXVersion: 86, SMVersions: {100, 101});
149 }
150
151 bool hasTcgen05MMAScaleInputDImm() const {
152 return hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {100}) || hasPTXWithAccelSMs(PTXVersion: 86, SMVersions: {100});
153 }
154
155 bool hasTcgen05MMAI8Kind() const {
156 return hasPTXWithAccelSMs(PTXVersion: 90, SMVersions: {100, 110}) ||
157 hasPTXWithAccelSMs(PTXVersion: 86, SMVersions: {100, 101});
158 }
159
160 bool hasTcgen05MMASparseMxf4nvf4() const {
161 return hasPTXWithAccelSMs(PTXVersion: 90, SMVersions: {100, 110, 103}) ||
162 hasPTXWithAccelSMs(PTXVersion: 87, SMVersions: {100, 101, 103});
163 }
164
165 bool hasTcgen05MMASparseMxf4() const {
166 return hasPTXWithAccelSMs(PTXVersion: 90, SMVersions: {100, 110, 103}) ||
167 hasPTXWithAccelSMs(PTXVersion: 86, SMVersions: {100, 101, 103});
168 }
169
170 bool hasTcgen05LdRedSupport() const {
171 return hasPTXWithFamilySMs(PTXVersion: 90, SMVersions: {110, 103}) ||
172 hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {101, 103});
173 }
174
175 bool hasReduxSyncF32() const {
176 return hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {100}) || hasPTXWithAccelSMs(PTXVersion: 86, SMVersions: {100});
177 }
178
179 bool hasMMABlockScale() const {
180 return hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {120}) || hasPTXWithAccelSMs(PTXVersion: 87, SMVersions: {120});
181 }
182
183 bool hasMMASparseBlockScaleF4() const {
184 return hasPTXWithAccelSMs(PTXVersion: 87, SMVersions: {120, 121});
185 }
186
187 bool hasMMAWithMXF4NVF4Scale4xE8M0() const {
188 return hasPTXWithFamilySMs(PTXVersion: 91, SMVersions: {120});
189 }
190
191 bool hasMMASparseWithMXF4NVF4Scale4xE8M0() const {
192 return hasPTXWithAccelSMs(PTXVersion: 91, SMVersions: {120, 121});
193 }
194
195 // f32x2 instructions in Blackwell family
196 bool hasF32x2Instructions() const;
197
198 // Checks support for following in TMA:
199 // - cta_group::1/2 support
200 // - im2col_w/w_128 mode support
201 // - tile_gather4 mode support
202 // - tile_scatter4 mode support
203 bool hasTMABlackwellSupport() const {
204 return hasPTXWithFamilySMs(PTXVersion: 90, SMVersions: {100, 110}) ||
205 hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {100, 101}) ||
206 hasPTXWithAccelSMs(PTXVersion: 86, SMVersions: {100, 101});
207 }
208
209 // Checks support for conversions involving e4m3x2 and e5m2x2.
210 bool hasFP8ConversionSupport() const {
211 if (PTXVersion >= 81)
212 return SmVersion >= 89;
213
214 if (PTXVersion >= 78)
215 return SmVersion >= 90;
216
217 return false;
218 }
219
220 // Checks support for conversions involving the following types:
221 // - e2m3x2/e3m2x2
222 // - e2m1x2
223 // - ue8m0x2
224 bool hasNarrowFPConversionSupport() const {
225 return hasPTXWithFamilySMs(PTXVersion: 90, SMVersions: {100, 110, 120}) ||
226 hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {100, 101, 120}) ||
227 hasPTXWithAccelSMs(PTXVersion: 86, SMVersions: {100, 101, 120});
228 }
229
230 // Checks support for conversions involving the following types:
231 // - bf16x2 -> f8x2
232 // - f16x2 -> f6x2
233 // - bf16x2 -> f6x2
234 // - f16x2 -> f4x2
235 // - bf16x2 -> f4x2
236 bool hasFP16X2ToNarrowFPConversionSupport() const {
237 return hasPTXWithFamilySMs(PTXVersion: 91, SMVersions: {100, 110, 120});
238 }
239
240 bool hasS2F6X2ConversionSupport() const {
241 return hasPTXWithAccelSMs(PTXVersion: 91, SMVersions: {100, 103, 110, 120, 121});
242 }
243
244 // Checks support for conversions from narrow FP types to bf16x2.
245 bool hasNarrowFPToBF16x2ConversionSupport() const {
246 return hasPTXWithFamilySMs(PTXVersion: 92, SMVersions: {100, 110, 120});
247 }
248
249 bool hasTensormapReplaceSupport() const {
250 return hasPTXWithFamilySMs(PTXVersion: 90, SMVersions: {90, 100, 110, 120}) ||
251 hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {90, 100, 101, 120}) ||
252 hasPTXWithAccelSMs(PTXVersion: 83, SMVersions: {90, 100, 101, 120});
253 }
254
255 bool hasTensormapReplaceElemtypeSupport(unsigned value) const {
256 if (value >= static_cast<unsigned>(nvvm::TensormapElemType::B4x16))
257 return hasPTXWithFamilySMs(PTXVersion: 90, SMVersions: {100, 110, 120}) ||
258 hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {100, 101, 120}) ||
259 hasPTXWithAccelSMs(PTXVersion: 87, SMVersions: {100, 101, 120});
260
261 return hasTensormapReplaceSupport();
262 }
263
264 bool hasTensormapReplaceSwizzleAtomicitySupport() const {
265 return hasPTXWithFamilySMs(PTXVersion: 90, SMVersions: {100, 110, 120}) ||
266 hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {100, 101, 120}) ||
267 hasPTXWithAccelSMs(PTXVersion: 87, SMVersions: {100, 101, 120});
268 }
269
270 bool hasTensormapReplaceSwizzleModeSupport(unsigned value) const {
271 if (value == static_cast<unsigned>(nvvm::TensormapSwizzleMode::SWIZZLE_96B))
272 return hasPTXWithAccelSMs(PTXVersion: 88, SMVersions: {103});
273
274 return hasTensormapReplaceSupport();
275 }
276
277 bool hasClusterLaunchControlTryCancelMulticastSupport() const {
278 return hasPTXWithFamilySMs(PTXVersion: 90, SMVersions: {100, 110, 120}) ||
279 hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {100, 101, 120}) ||
280 hasPTXWithAccelSMs(PTXVersion: 86, SMVersions: {100, 101, 120});
281 }
282
283 bool hasSetMaxNRegSupport() const {
284 return hasPTXWithFamilySMs(PTXVersion: 90, SMVersions: {100, 110, 120}) ||
285 hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {100, 101, 120}) ||
286 hasPTXWithAccelSMs(PTXVersion: 86, SMVersions: {100, 101, 120}) ||
287 hasPTXWithAccelSMs(PTXVersion: 80, SMVersions: {90});
288 }
289
290 bool hasLdStmatrixBlackwellSupport() const {
291 return hasPTXWithFamilySMs(PTXVersion: 90, SMVersions: {100, 110, 120}) ||
292 hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {100, 101, 120}) ||
293 hasPTXWithAccelSMs(PTXVersion: 86, SMVersions: {100, 101, 120});
294 }
295
296 // Prior to CUDA 12.3 ptxas did not recognize that the trap instruction
297 // terminates a basic block. Instead, it would assume that control flow
298 // continued to the next instruction. The next instruction could be in the
299 // block that's lexically below it. This would lead to a phantom CFG edges
300 // being created within ptxas. This issue was fixed in CUDA 12.3. Thus, when
301 // PTX ISA versions 8.3+ we can confidently say that the bug will not be
302 // present.
303 bool hasPTXASUnreachableBug() const { return PTXVersion < 83; }
304 bool hasCvtaParam() const { return SmVersion >= 70 && PTXVersion >= 77; }
305 bool hasConvertWithStochasticRounding() const {
306 return hasPTXWithAccelSMs(PTXVersion: 87, SMVersions: {100, 103});
307 }
308 unsigned int getFullSmVersion() const { return FullSmVersion; }
309 unsigned int getSmVersion() const { return getFullSmVersion() / 10; }
310 unsigned int getSmFamilyVersion() const { return getFullSmVersion() / 100; }
311 // GPUs with "a" suffix have architecture-accelerated features that are
312 // supported on the specified architecture only, hence such targets do not
313 // follow the onion layer model. hasArchAccelFeatures() allows distinguishing
314 // such GPU variants from the base GPU architecture.
315 // - false represents non-accelerated architecture.
316 // - true represents architecture-accelerated variant.
317 bool hasArchAccelFeatures() const {
318 return (getFullSmVersion() & 1) && PTXVersion >= 80;
319 }
320 // GPUs with 'f' suffix have architecture-accelerated features which are
321 // portable across all future architectures under same SM major. For example,
322 // sm_100f features will work for sm_10X*f*/sm_10X*a* future architectures.
323 // - false represents non-family-specific architecture.
324 // - true represents family-specific variant.
325 bool hasFamilySpecificFeatures() const {
326 return getFullSmVersion() % 10 == 2 ? PTXVersion >= 88
327 : hasArchAccelFeatures();
328 }
329 // If the user did not provide a target we default to the `sm_75` target.
330 std::string getTargetName() const {
331 return TargetName.empty() ? "sm_75" : TargetName;
332 }
333 bool hasTargetName() const { return !TargetName.empty(); }
334
335 bool hasNativeBF16Support(int Opcode) const;
336
337 // Get maximum value of required alignments among the supported data types.
338 // From the PTX ISA doc, section 8.2.3:
339 // The memory consistency model relates operations executed on memory
340 // locations with scalar data-types, which have a maximum size and alignment
341 // of 64 bits. Memory operations with a vector data-type are modelled as a
342 // set of equivalent memory operations with a scalar data-type, executed in
343 // an unspecified order on the elements in the vector.
344 unsigned getMaxRequiredAlignment() const { return 8; }
345 // Get the smallest cmpxchg word size that the hardware supports.
346 unsigned getMinCmpXchgSizeInBits() const { return 32; }
347
348 unsigned getPTXVersion() const { return PTXVersion; }
349
350 NVPTXSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
351 void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
352
353 void failIfClustersUnsupported(std::string const &FailureMessage) const;
354};
355
356} // End llvm namespace
357
358#endif
359