1//=====-- NVPTXSubtarget.h - Define Subtarget for the NVPTX ---*- C++ -*--====//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file declares the NVPTX specific subclass of TargetSubtarget.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXSUBTARGET_H
14#define LLVM_LIB_TARGET_NVPTX_NVPTXSUBTARGET_H
15
16#include "NVPTX.h"
17#include "NVPTXFrameLowering.h"
18#include "NVPTXISelLowering.h"
19#include "NVPTXInstrInfo.h"
20#include "NVPTXRegisterInfo.h"
21#include "llvm/CodeGen/TargetSubtargetInfo.h"
22#include "llvm/IR/DataLayout.h"
23#include "llvm/IR/NVVMIntrinsicUtils.h"
24#include "llvm/Support/NVPTXAddrSpace.h"
25#include <string>
26
27#define GET_SUBTARGETINFO_HEADER
28#include "NVPTXGenSubtargetInfo.inc"
29
30namespace llvm {
31
32// FullSmVersion encoding: SM * 10 + ArchSuffixOffset
33// ArchSuffixOffset: 0 (base), 2 ('f'), 3 ('a')
34// e.g. sm_100 -> 1000, sm_100f -> 1002, sm_100a -> 1003
35
36class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
37 virtual void anchor();
38 std::string TargetName;
39
40 // PTX version x.y is represented as 10*x+y, e.g. 3.1 == 31
41 unsigned PTXVersion;
42
43 // FullSmVersion encoding: SM * 10 + ArchSuffixOffset
44 // ArchSuffixOffset: 0 (base), 2 ('f'), 3 ('a')
45 // e.g. sm_30 -> 300, sm_90a -> 903, sm_100f -> 1002
46 unsigned int FullSmVersion;
47
48 // SM version x.y is represented as 10*x+y, e.g. 3.1 == 31. Derived from
49 // FullSmVersion.
50 unsigned int SmVersion;
51
52 NVPTXInstrInfo InstrInfo;
53 NVPTXTargetLowering TLInfo;
54 std::unique_ptr<const SelectionDAGTargetInfo> TSInfo;
55
56 // NVPTX does not have any call stack frame, but need a NVPTX specific
57 // FrameLowering class because TargetFrameLowering is abstract.
58 NVPTXFrameLowering FrameLowering;
59
60public:
61 /// This constructor initializes the data members to match that
62 /// of the specified module.
63 ///
64 NVPTXSubtarget(const Triple &TT, const std::string &CPU,
65 const std::string &FS, const NVPTXTargetMachine &TM);
66
67 ~NVPTXSubtarget() override;
68
69 const TargetFrameLowering *getFrameLowering() const override {
70 return &FrameLowering;
71 }
72 const NVPTXInstrInfo *getInstrInfo() const override { return &InstrInfo; }
73 const NVPTXRegisterInfo *getRegisterInfo() const override {
74 return &InstrInfo.getRegisterInfo();
75 }
76 const NVPTXTargetLowering *getTargetLowering() const override {
77 return &TLInfo;
78 }
79
80 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override;
81
82 // Checks PTX version and family-specific and architecture-specific SM
83 // versions. For example, sm_100{f/a} and any future variants in the same
84 // family will match for any PTX version greater than or equal to
85 // `PTXVersion`.
86 bool hasPTXWithFamilySMs(unsigned PTXVersion,
87 ArrayRef<unsigned> SMVersions) const;
88 // Checks PTX version and architecture-specific SM versions.
89 // For example, sm_100{a} will match for any PTX version greater than or equal
90 // to `PTXVersion`.
91 bool hasPTXWithAccelSMs(unsigned PTXVersion,
92 ArrayRef<unsigned> SMVersions) const;
93
94 bool has256BitVectorLoadStore(unsigned AS) const {
95 return SmVersion >= 100 && PTXVersion >= 88 &&
96 AS == NVPTXAS::ADDRESS_SPACE_GLOBAL;
97 }
98 bool hasUsedBytesMaskPragma() const {
99 return SmVersion >= 50 && PTXVersion >= 83;
100 }
101 bool hasAtomAddF64() const { return SmVersion >= 60; }
102 bool hasAtomScope() const { return SmVersion >= 60; }
103 bool hasAtomBitwise64() const { return SmVersion >= 32; }
104 bool hasAtomMinMax64() const { return SmVersion >= 32; }
105 bool hasAtomCas16() const { return SmVersion >= 70 && PTXVersion >= 63; }
106 bool hasAtomSwap128() const { return SmVersion >= 90 && PTXVersion >= 83; }
107 bool hasClusters() const { return SmVersion >= 90 && PTXVersion >= 78; }
108 bool hasLDG() const { return SmVersion >= 32; }
109 bool hasHWROT32() const { return SmVersion >= 32; }
110 bool hasBrx() const { return SmVersion >= 30 && PTXVersion >= 60; }
111 bool hasFP16Math() const { return SmVersion >= 53; }
112 bool hasBF16Math() const { return SmVersion >= 80; }
113 bool allowFP16Math() const;
114 bool hasMaskOperator() const { return PTXVersion >= 71; }
115 bool hasNoReturn() const { return SmVersion >= 30 && PTXVersion >= 64; }
116 // Does SM & PTX support memory orderings (weak and atomic: relaxed, acquire,
117 // release, acq_rel, sc) ?
118 bool hasMemoryOrdering() const { return SmVersion >= 70 && PTXVersion >= 60; }
119 // Does SM & PTX support .acquire and .release qualifiers for fence?
120 bool hasSplitAcquireAndReleaseFences() const {
121 return SmVersion >= 90 && PTXVersion >= 86;
122 }
123 // Does SM & PTX support atomic relaxed MMIO operations ?
124 bool hasRelaxedMMIO() const { return SmVersion >= 70 && PTXVersion >= 82; }
125 bool hasDotInstructions() const {
126 return SmVersion >= 61 && PTXVersion >= 50;
127 }
128
129 // Checks following instructions support:
130 // - tcgen05.ld/st
131 // - tcgen05.alloc/dealloc/relinquish
132 // - tcgen05.cp
133 // - tcgen05.fence/wait
134 // - tcgen05.commit
135 // - tcgen05.mma
136 bool hasTcgen05InstSupport() const {
137 // sm_101 renamed to sm_110 in PTX 9.0
138 return hasPTXWithFamilySMs(PTXVersion: 90, SMVersions: {100, 110}) ||
139 hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {100, 101}) ||
140 hasPTXWithAccelSMs(PTXVersion: 86, SMVersions: {100, 101});
141 }
142
143 // Checks tcgen05.shift instruction support.
144 bool hasTcgen05ShiftSupport() const {
145 // sm_101 renamed to sm_110 in PTX 9.0
146 return hasPTXWithAccelSMs(PTXVersion: 90, SMVersions: {100, 110, 103}) ||
147 hasPTXWithAccelSMs(PTXVersion: 88, SMVersions: {100, 101, 103}) ||
148 hasPTXWithAccelSMs(PTXVersion: 86, SMVersions: {100, 101});
149 }
150
151 bool hasTcgen05MMAScaleInputDImm() const {
152 return hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {100}) || hasPTXWithAccelSMs(PTXVersion: 86, SMVersions: {100});
153 }
154
155 bool hasTcgen05MMAI8Kind() const {
156 return hasPTXWithAccelSMs(PTXVersion: 90, SMVersions: {100, 110}) ||
157 hasPTXWithAccelSMs(PTXVersion: 86, SMVersions: {100, 101});
158 }
159
160 bool hasTcgen05MMASparseMxf4nvf4() const {
161 return hasPTXWithAccelSMs(PTXVersion: 90, SMVersions: {100, 110, 103}) ||
162 hasPTXWithAccelSMs(PTXVersion: 87, SMVersions: {100, 101, 103});
163 }
164
165 bool hasTcgen05MMASparseMxf4() const {
166 return hasPTXWithAccelSMs(PTXVersion: 90, SMVersions: {100, 110, 103}) ||
167 hasPTXWithAccelSMs(PTXVersion: 86, SMVersions: {100, 101, 103});
168 }
169
170 bool hasTcgen05LdRedSupport() const {
171 return hasPTXWithFamilySMs(PTXVersion: 90, SMVersions: {110, 103}) ||
172 hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {101, 103});
173 }
174
175 bool hasReduxSyncF32() const {
176 return hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {100}) || hasPTXWithAccelSMs(PTXVersion: 86, SMVersions: {100});
177 }
178
179 bool hasMMABlockScale() const {
180 return hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {120}) || hasPTXWithAccelSMs(PTXVersion: 87, SMVersions: {120});
181 }
182
183 bool hasMMASparseBlockScaleF4() const {
184 return hasPTXWithAccelSMs(PTXVersion: 87, SMVersions: {120, 121});
185 }
186
187 bool hasMMAWithMXF4NVF4Scale4xE8M0() const {
188 return hasPTXWithFamilySMs(PTXVersion: 91, SMVersions: {120});
189 }
190
191 bool hasMMASparseWithMXF4NVF4Scale4xE8M0() const {
192 return hasPTXWithAccelSMs(PTXVersion: 91, SMVersions: {120, 121});
193 }
194
195 // f32x2 instructions in Blackwell family
196 bool hasF32x2Instructions() const;
197
198 // Checks support for following in TMA:
199 // - cta_group::1/2 support
200 // - im2col_w/w_128 mode support
201 // - tile_gather4 mode support
202 // - tile_scatter4 mode support
203 bool hasTMABlackwellSupport() const {
204 return hasPTXWithFamilySMs(PTXVersion: 90, SMVersions: {100, 110}) ||
205 hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {100, 101}) ||
206 hasPTXWithAccelSMs(PTXVersion: 86, SMVersions: {100, 101});
207 }
208
209 // Checks support for conversions involving e4m3x2 and e5m2x2.
210 bool hasFP8ConversionSupport() const {
211 if (PTXVersion >= 81)
212 return SmVersion >= 89;
213
214 if (PTXVersion >= 78)
215 return SmVersion >= 90;
216
217 return false;
218 }
219
220 // Checks support for conversions involving the following types:
221 // - e2m3x2/e3m2x2
222 // - e2m1x2
223 // - ue8m0x2
224 bool hasNarrowFPConversionSupport() const {
225 return hasPTXWithFamilySMs(PTXVersion: 90, SMVersions: {100, 110, 120}) ||
226 hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {100, 101, 120}) ||
227 hasPTXWithAccelSMs(PTXVersion: 86, SMVersions: {100, 101, 120});
228 }
229
230 // Checks support for conversions involving the following types:
231 // - bf16x2 -> f8x2
232 // - f16x2 -> f6x2
233 // - bf16x2 -> f6x2
234 // - f16x2 -> f4x2
235 // - bf16x2 -> f4x2
236 bool hasFP16X2ToNarrowFPConversionSupport() const {
237 return hasPTXWithFamilySMs(PTXVersion: 91, SMVersions: {100, 110, 120});
238 }
239
240 bool hasS2F6X2ConversionSupport() const {
241 return hasPTXWithAccelSMs(PTXVersion: 91, SMVersions: {100, 103, 110, 120, 121});
242 }
243
244 bool hasTensormapReplaceSupport() const {
245 return hasPTXWithFamilySMs(PTXVersion: 90, SMVersions: {90, 100, 110, 120}) ||
246 hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {90, 100, 101, 120}) ||
247 hasPTXWithAccelSMs(PTXVersion: 83, SMVersions: {90, 100, 101, 120});
248 }
249
250 bool hasTensormapReplaceElemtypeSupport(unsigned value) const {
251 if (value >= static_cast<unsigned>(nvvm::TensormapElemType::B4x16))
252 return hasPTXWithFamilySMs(PTXVersion: 90, SMVersions: {100, 110, 120}) ||
253 hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {100, 101, 120}) ||
254 hasPTXWithAccelSMs(PTXVersion: 87, SMVersions: {100, 101, 120});
255
256 return hasTensormapReplaceSupport();
257 }
258
259 bool hasTensormapReplaceSwizzleAtomicitySupport() const {
260 return hasPTXWithFamilySMs(PTXVersion: 90, SMVersions: {100, 110, 120}) ||
261 hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {100, 101, 120}) ||
262 hasPTXWithAccelSMs(PTXVersion: 87, SMVersions: {100, 101, 120});
263 }
264
265 bool hasTensormapReplaceSwizzleModeSupport(unsigned value) const {
266 if (value == static_cast<unsigned>(nvvm::TensormapSwizzleMode::SWIZZLE_96B))
267 return hasPTXWithAccelSMs(PTXVersion: 88, SMVersions: {103});
268
269 return hasTensormapReplaceSupport();
270 }
271
272 bool hasClusterLaunchControlTryCancelMulticastSupport() const {
273 return hasPTXWithFamilySMs(PTXVersion: 90, SMVersions: {100, 110, 120}) ||
274 hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {100, 101, 120}) ||
275 hasPTXWithAccelSMs(PTXVersion: 86, SMVersions: {100, 101, 120});
276 }
277
278 bool hasSetMaxNRegSupport() const {
279 return hasPTXWithFamilySMs(PTXVersion: 90, SMVersions: {100, 110, 120}) ||
280 hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {100, 101, 120}) ||
281 hasPTXWithAccelSMs(PTXVersion: 86, SMVersions: {100, 101, 120}) ||
282 hasPTXWithAccelSMs(PTXVersion: 80, SMVersions: {90});
283 }
284
285 bool hasLdStmatrixBlackwellSupport() const {
286 return hasPTXWithFamilySMs(PTXVersion: 90, SMVersions: {100, 110, 120}) ||
287 hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {100, 101, 120}) ||
288 hasPTXWithAccelSMs(PTXVersion: 86, SMVersions: {100, 101, 120});
289 }
290
291 // Prior to CUDA 12.3 ptxas did not recognize that the trap instruction
292 // terminates a basic block. Instead, it would assume that control flow
293 // continued to the next instruction. The next instruction could be in the
294 // block that's lexically below it. This would lead to a phantom CFG edges
295 // being created within ptxas. This issue was fixed in CUDA 12.3. Thus, when
296 // PTX ISA versions 8.3+ we can confidently say that the bug will not be
297 // present.
298 bool hasPTXASUnreachableBug() const { return PTXVersion < 83; }
299 bool hasCvtaParam() const { return SmVersion >= 70 && PTXVersion >= 77; }
300 bool hasConvertWithStochasticRounding() const {
301 return hasPTXWithAccelSMs(PTXVersion: 87, SMVersions: {100, 103});
302 }
303 unsigned int getFullSmVersion() const { return FullSmVersion; }
304 unsigned int getSmVersion() const { return getFullSmVersion() / 10; }
305 unsigned int getSmFamilyVersion() const { return getFullSmVersion() / 100; }
306 // GPUs with "a" suffix have architecture-accelerated features that are
307 // supported on the specified architecture only, hence such targets do not
308 // follow the onion layer model. hasArchAccelFeatures() allows distinguishing
309 // such GPU variants from the base GPU architecture.
310 // - false represents non-accelerated architecture.
311 // - true represents architecture-accelerated variant.
312 bool hasArchAccelFeatures() const {
313 return (getFullSmVersion() & 1) && PTXVersion >= 80;
314 }
315 // GPUs with 'f' suffix have architecture-accelerated features which are
316 // portable across all future architectures under same SM major. For example,
317 // sm_100f features will work for sm_10X*f*/sm_10X*a* future architectures.
318 // - false represents non-family-specific architecture.
319 // - true represents family-specific variant.
320 bool hasFamilySpecificFeatures() const {
321 return getFullSmVersion() % 10 == 2 ? PTXVersion >= 88
322 : hasArchAccelFeatures();
323 }
324 // If the user did not provide a target we default to the `sm_75` target.
325 std::string getTargetName() const {
326 return TargetName.empty() ? "sm_75" : TargetName;
327 }
328 bool hasTargetName() const { return !TargetName.empty(); }
329
330 bool hasNativeBF16Support(int Opcode) const;
331
332 // Get maximum value of required alignments among the supported data types.
333 // From the PTX ISA doc, section 8.2.3:
334 // The memory consistency model relates operations executed on memory
335 // locations with scalar data-types, which have a maximum size and alignment
336 // of 64 bits. Memory operations with a vector data-type are modelled as a
337 // set of equivalent memory operations with a scalar data-type, executed in
338 // an unspecified order on the elements in the vector.
339 unsigned getMaxRequiredAlignment() const { return 8; }
340 // Get the smallest cmpxchg word size that the hardware supports.
341 unsigned getMinCmpXchgSizeInBits() const { return 32; }
342
343 unsigned getPTXVersion() const { return PTXVersion; }
344
345 NVPTXSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
346 void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
347
348 void failIfClustersUnsupported(std::string const &FailureMessage) const;
349};
350
351} // End llvm namespace
352
353#endif
354