1//=====-- NVPTXSubtarget.h - Define Subtarget for the NVPTX ---*- C++ -*--====//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file declares the NVPTX specific subclass of TargetSubtarget.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXSUBTARGET_H
14#define LLVM_LIB_TARGET_NVPTX_NVPTXSUBTARGET_H
15
16#include "NVPTX.h"
17#include "NVPTXFrameLowering.h"
18#include "NVPTXISelLowering.h"
19#include "NVPTXInstrInfo.h"
20#include "NVPTXRegisterInfo.h"
21#include "llvm/CodeGen/TargetSubtargetInfo.h"
22#include "llvm/IR/DataLayout.h"
23#include "llvm/IR/NVVMIntrinsicUtils.h"
24#include "llvm/Support/NVPTXAddrSpace.h"
25#include <string>
26
27#define GET_SUBTARGETINFO_HEADER
28#include "NVPTXGenSubtargetInfo.inc"
29
30namespace llvm {
31
32class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
33 virtual void anchor();
34 std::string TargetName;
35
36 // PTX version x.y is represented as 10*x+y, e.g. 3.1 == 31
37 unsigned PTXVersion;
38
39 // Full SM version x.y is represented as 100*x+10*y+feature, e.g. 3.1 == 310
40 // sm_90a == 901
41 unsigned int FullSmVersion;
42
43 // SM version x.y is represented as 10*x+y, e.g. 3.1 == 31. Derived from
44 // FullSmVersion.
45 unsigned int SmVersion;
46
47 NVPTXInstrInfo InstrInfo;
48 NVPTXTargetLowering TLInfo;
49 std::unique_ptr<const SelectionDAGTargetInfo> TSInfo;
50
51 // NVPTX does not have any call stack frame, but need a NVPTX specific
52 // FrameLowering class because TargetFrameLowering is abstract.
53 NVPTXFrameLowering FrameLowering;
54
55public:
56 /// This constructor initializes the data members to match that
57 /// of the specified module.
58 ///
59 NVPTXSubtarget(const Triple &TT, const std::string &CPU,
60 const std::string &FS, const NVPTXTargetMachine &TM);
61
62 ~NVPTXSubtarget() override;
63
64 const TargetFrameLowering *getFrameLowering() const override {
65 return &FrameLowering;
66 }
67 const NVPTXInstrInfo *getInstrInfo() const override { return &InstrInfo; }
68 const NVPTXRegisterInfo *getRegisterInfo() const override {
69 return &InstrInfo.getRegisterInfo();
70 }
71 const NVPTXTargetLowering *getTargetLowering() const override {
72 return &TLInfo;
73 }
74
75 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override;
76
77 // Checks PTX version and family-specific and architecture-specific SM
78 // versions. For example, sm_100{f/a} and any future variants in the same
79 // family will match for any PTX version greater than or equal to
80 // `PTXVersion`.
81 bool hasPTXWithFamilySMs(unsigned PTXVersion,
82 ArrayRef<unsigned> SMVersions) const;
83 // Checks PTX version and architecture-specific SM versions.
84 // For example, sm_100{a} will match for any PTX version greater than or equal
85 // to `PTXVersion`.
86 bool hasPTXWithAccelSMs(unsigned PTXVersion,
87 ArrayRef<unsigned> SMVersions) const;
88
89 bool has256BitVectorLoadStore(unsigned AS) const {
90 return SmVersion >= 100 && PTXVersion >= 88 &&
91 AS == NVPTXAS::ADDRESS_SPACE_GLOBAL;
92 }
93 bool hasUsedBytesMaskPragma() const {
94 return SmVersion >= 50 && PTXVersion >= 83;
95 }
96 bool hasAtomAddF64() const { return SmVersion >= 60; }
97 bool hasAtomScope() const { return SmVersion >= 60; }
98 bool hasAtomBitwise64() const { return SmVersion >= 32; }
99 bool hasAtomMinMax64() const { return SmVersion >= 32; }
100 bool hasAtomCas16() const { return SmVersion >= 70 && PTXVersion >= 63; }
101 bool hasAtomSwap128() const { return SmVersion >= 90 && PTXVersion >= 83; }
102 bool hasClusters() const { return SmVersion >= 90 && PTXVersion >= 78; }
103 bool hasLDG() const { return SmVersion >= 32; }
104 bool hasHWROT32() const { return SmVersion >= 32; }
105 bool hasBrx() const { return SmVersion >= 30 && PTXVersion >= 60; }
106 bool hasFP16Math() const { return SmVersion >= 53; }
107 bool hasBF16Math() const { return SmVersion >= 80; }
108 bool allowFP16Math() const;
109 bool hasMaskOperator() const { return PTXVersion >= 71; }
110 bool hasNoReturn() const { return SmVersion >= 30 && PTXVersion >= 64; }
111 // Does SM & PTX support memory orderings (weak and atomic: relaxed, acquire,
112 // release, acq_rel, sc) ?
113 bool hasMemoryOrdering() const { return SmVersion >= 70 && PTXVersion >= 60; }
114 // Does SM & PTX support .acquire and .release qualifiers for fence?
115 bool hasSplitAcquireAndReleaseFences() const {
116 return SmVersion >= 90 && PTXVersion >= 86;
117 }
118 // Does SM & PTX support atomic relaxed MMIO operations ?
119 bool hasRelaxedMMIO() const { return SmVersion >= 70 && PTXVersion >= 82; }
120 bool hasDotInstructions() const {
121 return SmVersion >= 61 && PTXVersion >= 50;
122 }
123
124 // Checks following instructions support:
125 // - tcgen05.ld/st
126 // - tcgen05.alloc/dealloc/relinquish
127 // - tcgen05.cp
128 // - tcgen05.fence/wait
129 // - tcgen05.commit
130 // - tcgen05.mma
131 bool hasTcgen05InstSupport() const {
132 // sm_101 renamed to sm_110 in PTX 9.0
133 return hasPTXWithFamilySMs(PTXVersion: 90, SMVersions: {100, 110}) ||
134 hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {100, 101}) ||
135 hasPTXWithAccelSMs(PTXVersion: 86, SMVersions: {100, 101});
136 }
137
138 // Checks tcgen05.shift instruction support.
139 bool hasTcgen05ShiftSupport() const {
140 // sm_101 renamed to sm_110 in PTX 9.0
141 return hasPTXWithAccelSMs(PTXVersion: 90, SMVersions: {100, 110, 103}) ||
142 hasPTXWithAccelSMs(PTXVersion: 88, SMVersions: {100, 101, 103}) ||
143 hasPTXWithAccelSMs(PTXVersion: 86, SMVersions: {100, 101});
144 }
145
146 bool hasTcgen05MMAScaleInputDImm() const {
147 return hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {100}) || hasPTXWithAccelSMs(PTXVersion: 86, SMVersions: {100});
148 }
149
150 bool hasTcgen05MMAI8Kind() const {
151 return hasPTXWithAccelSMs(PTXVersion: 90, SMVersions: {100, 110}) ||
152 hasPTXWithAccelSMs(PTXVersion: 86, SMVersions: {100, 101});
153 }
154
155 bool hasTcgen05MMASparseMxf4nvf4() const {
156 return hasPTXWithAccelSMs(PTXVersion: 90, SMVersions: {100, 110, 103}) ||
157 hasPTXWithAccelSMs(PTXVersion: 87, SMVersions: {100, 101, 103});
158 }
159
160 bool hasTcgen05MMASparseMxf4() const {
161 return hasPTXWithAccelSMs(PTXVersion: 90, SMVersions: {100, 110, 103}) ||
162 hasPTXWithAccelSMs(PTXVersion: 86, SMVersions: {100, 101, 103});
163 }
164
165 bool hasTcgen05LdRedSupport() const {
166 return hasPTXWithFamilySMs(PTXVersion: 90, SMVersions: {110, 103}) ||
167 hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {101, 103});
168 }
169
170 bool hasReduxSyncF32() const {
171 return hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {100}) || hasPTXWithAccelSMs(PTXVersion: 86, SMVersions: {100});
172 }
173
174 bool hasMMABlockScale() const {
175 return hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {120}) || hasPTXWithAccelSMs(PTXVersion: 87, SMVersions: {120});
176 }
177
178 bool hasMMASparseBlockScaleF4() const {
179 return hasPTXWithAccelSMs(PTXVersion: 87, SMVersions: {120, 121});
180 }
181
182 // f32x2 instructions in Blackwell family
183 bool hasF32x2Instructions() const;
184
185 // Checks support for following in TMA:
186 // - cta_group::1/2 support
187 // - im2col_w/w_128 mode support
188 // - tile_gather4 mode support
189 // - tile_scatter4 mode support
190 bool hasTMABlackwellSupport() const {
191 return hasPTXWithFamilySMs(PTXVersion: 90, SMVersions: {100, 110}) ||
192 hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {100, 101}) ||
193 hasPTXWithAccelSMs(PTXVersion: 86, SMVersions: {100, 101});
194 }
195
196 // Checks support for conversions involving e4m3x2 and e5m2x2.
197 bool hasFP8ConversionSupport() const {
198 if (PTXVersion >= 81)
199 return SmVersion >= 89;
200
201 if (PTXVersion >= 78)
202 return SmVersion >= 90;
203
204 return false;
205 }
206
207 // Checks support for conversions involving the following types:
208 // - e2m3x2/e3m2x2
209 // - e2m1x2
210 // - ue8m0x2
211 bool hasNarrowFPConversionSupport() const {
212 return hasPTXWithFamilySMs(PTXVersion: 90, SMVersions: {100, 110, 120}) ||
213 hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {100, 101, 120}) ||
214 hasPTXWithAccelSMs(PTXVersion: 86, SMVersions: {100, 101, 120});
215 }
216
217 bool hasTensormapReplaceSupport() const {
218 return hasPTXWithFamilySMs(PTXVersion: 90, SMVersions: {90, 100, 110, 120}) ||
219 hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {90, 100, 101, 120}) ||
220 hasPTXWithAccelSMs(PTXVersion: 83, SMVersions: {90, 100, 101, 120});
221 }
222
223 bool hasTensormapReplaceElemtypeSupport(unsigned value) const {
224 if (value >= static_cast<unsigned>(nvvm::TensormapElemType::B4x16))
225 return hasPTXWithFamilySMs(PTXVersion: 90, SMVersions: {100, 110, 120}) ||
226 hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {100, 101, 120}) ||
227 hasPTXWithAccelSMs(PTXVersion: 87, SMVersions: {100, 101, 120});
228
229 return hasTensormapReplaceSupport();
230 }
231
232 bool hasTensormapReplaceSwizzleAtomicitySupport() const {
233 return hasPTXWithFamilySMs(PTXVersion: 90, SMVersions: {100, 110, 120}) ||
234 hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {100, 101, 120}) ||
235 hasPTXWithAccelSMs(PTXVersion: 87, SMVersions: {100, 101, 120});
236 }
237
238 bool hasTensormapReplaceSwizzleModeSupport(unsigned value) const {
239 if (value == static_cast<unsigned>(nvvm::TensormapSwizzleMode::SWIZZLE_96B))
240 return hasPTXWithAccelSMs(PTXVersion: 88, SMVersions: {103});
241
242 return hasTensormapReplaceSupport();
243 }
244
245 bool hasClusterLaunchControlTryCancelMulticastSupport() const {
246 return hasPTXWithFamilySMs(PTXVersion: 90, SMVersions: {100, 110, 120}) ||
247 hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {100, 101, 120}) ||
248 hasPTXWithAccelSMs(PTXVersion: 86, SMVersions: {100, 101, 120});
249 }
250
251 bool hasSetMaxNRegSupport() const {
252 return hasPTXWithFamilySMs(PTXVersion: 90, SMVersions: {100, 110, 120}) ||
253 hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {100, 101, 120}) ||
254 hasPTXWithAccelSMs(PTXVersion: 86, SMVersions: {100, 101, 120}) ||
255 hasPTXWithAccelSMs(PTXVersion: 80, SMVersions: {90});
256 }
257
258 bool hasLdStmatrixBlackwellSupport() const {
259 return hasPTXWithFamilySMs(PTXVersion: 90, SMVersions: {100, 110, 120}) ||
260 hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {100, 101, 120}) ||
261 hasPTXWithAccelSMs(PTXVersion: 86, SMVersions: {100, 101, 120});
262 }
263
264 // Prior to CUDA 12.3 ptxas did not recognize that the trap instruction
265 // terminates a basic block. Instead, it would assume that control flow
266 // continued to the next instruction. The next instruction could be in the
267 // block that's lexically below it. This would lead to a phantom CFG edges
268 // being created within ptxas. This issue was fixed in CUDA 12.3. Thus, when
269 // PTX ISA versions 8.3+ we can confidently say that the bug will not be
270 // present.
271 bool hasPTXASUnreachableBug() const { return PTXVersion < 83; }
272 bool hasCvtaParam() const { return SmVersion >= 70 && PTXVersion >= 77; }
273 bool hasConvertWithStochasticRounding() const {
274 return hasPTXWithAccelSMs(PTXVersion: 87, SMVersions: {100, 103});
275 }
276 unsigned int getFullSmVersion() const { return FullSmVersion; }
277 unsigned int getSmVersion() const { return getFullSmVersion() / 10; }
278 unsigned int getSmFamilyVersion() const { return getFullSmVersion() / 100; }
279 // GPUs with "a" suffix have architecture-accelerated features that are
280 // supported on the specified architecture only, hence such targets do not
281 // follow the onion layer model. hasArchAccelFeatures() allows distinguishing
282 // such GPU variants from the base GPU architecture.
283 // - false represents non-accelerated architecture.
284 // - true represents architecture-accelerated variant.
285 bool hasArchAccelFeatures() const {
286 return (getFullSmVersion() & 1) && PTXVersion >= 80;
287 }
288 // GPUs with 'f' suffix have architecture-accelerated features which are
289 // portable across all future architectures under same SM major. For example,
290 // sm_100f features will work for sm_10X*f*/sm_10X*a* future architectures.
291 // - false represents non-family-specific architecture.
292 // - true represents family-specific variant.
293 bool hasFamilySpecificFeatures() const {
294 return getFullSmVersion() % 10 == 2 ? PTXVersion >= 88
295 : hasArchAccelFeatures();
296 }
297 // If the user did not provide a target we default to the `sm_75` target.
298 std::string getTargetName() const {
299 return TargetName.empty() ? "sm_75" : TargetName;
300 }
301 bool hasTargetName() const { return !TargetName.empty(); }
302
303 bool hasNativeBF16Support(int Opcode) const;
304
305 // Get maximum value of required alignments among the supported data types.
306 // From the PTX ISA doc, section 8.2.3:
307 // The memory consistency model relates operations executed on memory
308 // locations with scalar data-types, which have a maximum size and alignment
309 // of 64 bits. Memory operations with a vector data-type are modelled as a
310 // set of equivalent memory operations with a scalar data-type, executed in
311 // an unspecified order on the elements in the vector.
312 unsigned getMaxRequiredAlignment() const { return 8; }
313 // Get the smallest cmpxchg word size that the hardware supports.
314 unsigned getMinCmpXchgSizeInBits() const { return 32; }
315
316 unsigned getPTXVersion() const { return PTXVersion; }
317
318 NVPTXSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
319 void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
320
321 void failIfClustersUnsupported(std::string const &FailureMessage) const;
322};
323
324} // End llvm namespace
325
326#endif
327