| 1 | //=====-- NVPTXSubtarget.h - Define Subtarget for the NVPTX ---*- C++ -*--====// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // This file declares the NVPTX specific subclass of TargetSubtarget. |
| 10 | // |
| 11 | //===----------------------------------------------------------------------===// |
| 12 | |
| 13 | #ifndef LLVM_LIB_TARGET_NVPTX_NVPTXSUBTARGET_H |
| 14 | #define LLVM_LIB_TARGET_NVPTX_NVPTXSUBTARGET_H |
| 15 | |
| 16 | #include "NVPTX.h" |
| 17 | #include "NVPTXFrameLowering.h" |
| 18 | #include "NVPTXISelLowering.h" |
| 19 | #include "NVPTXInstrInfo.h" |
| 20 | #include "NVPTXRegisterInfo.h" |
| 21 | #include "llvm/CodeGen/TargetSubtargetInfo.h" |
| 22 | #include "llvm/IR/DataLayout.h" |
| 23 | #include "llvm/IR/NVVMIntrinsicUtils.h" |
| 24 | #include "llvm/Support/NVPTXAddrSpace.h" |
| 25 | #include <string> |
| 26 | |
| 27 | #define |
| 28 | #include "NVPTXGenSubtargetInfo.inc" |
| 29 | |
| 30 | namespace llvm { |
| 31 | |
| 32 | // FullSmVersion encoding: SM * 10 + ArchSuffixOffset |
| 33 | // ArchSuffixOffset: 0 (base), 2 ('f'), 3 ('a') |
| 34 | // e.g. sm_100 -> 1000, sm_100f -> 1002, sm_100a -> 1003 |
| 35 | |
| 36 | class NVPTXSubtarget : public NVPTXGenSubtargetInfo { |
| 37 | virtual void anchor(); |
| 38 | std::string TargetName; |
| 39 | |
| 40 | // PTX version x.y is represented as 10*x+y, e.g. 3.1 == 31 |
| 41 | unsigned PTXVersion; |
| 42 | |
| 43 | // FullSmVersion encoding: SM * 10 + ArchSuffixOffset |
| 44 | // ArchSuffixOffset: 0 (base), 2 ('f'), 3 ('a') |
| 45 | // e.g. sm_30 -> 300, sm_90a -> 903, sm_100f -> 1002 |
| 46 | unsigned int FullSmVersion; |
| 47 | |
| 48 | // SM version x.y is represented as 10*x+y, e.g. 3.1 == 31. Derived from |
| 49 | // FullSmVersion. |
| 50 | unsigned int SmVersion; |
| 51 | |
| 52 | NVPTXInstrInfo InstrInfo; |
| 53 | NVPTXTargetLowering TLInfo; |
| 54 | std::unique_ptr<const SelectionDAGTargetInfo> TSInfo; |
| 55 | |
| 56 | // NVPTX does not have any call stack frame, but need a NVPTX specific |
| 57 | // FrameLowering class because TargetFrameLowering is abstract. |
| 58 | NVPTXFrameLowering FrameLowering; |
| 59 | |
| 60 | public: |
| 61 | /// This constructor initializes the data members to match that |
| 62 | /// of the specified module. |
| 63 | /// |
| 64 | NVPTXSubtarget(const Triple &TT, const std::string &CPU, |
| 65 | const std::string &FS, const NVPTXTargetMachine &TM); |
| 66 | |
| 67 | ~NVPTXSubtarget() override; |
| 68 | |
| 69 | const TargetFrameLowering *getFrameLowering() const override { |
| 70 | return &FrameLowering; |
| 71 | } |
| 72 | const NVPTXInstrInfo *getInstrInfo() const override { return &InstrInfo; } |
| 73 | const NVPTXRegisterInfo *getRegisterInfo() const override { |
| 74 | return &InstrInfo.getRegisterInfo(); |
| 75 | } |
| 76 | const NVPTXTargetLowering *getTargetLowering() const override { |
| 77 | return &TLInfo; |
| 78 | } |
| 79 | |
| 80 | const SelectionDAGTargetInfo *getSelectionDAGInfo() const override; |
| 81 | |
| 82 | // Checks PTX version and family-specific and architecture-specific SM |
| 83 | // versions. For example, sm_100{f/a} and any future variants in the same |
| 84 | // family will match for any PTX version greater than or equal to |
| 85 | // `PTXVersion`. |
| 86 | bool hasPTXWithFamilySMs(unsigned PTXVersion, |
| 87 | ArrayRef<unsigned> SMVersions) const; |
| 88 | // Checks PTX version and architecture-specific SM versions. |
| 89 | // For example, sm_100{a} will match for any PTX version greater than or equal |
| 90 | // to `PTXVersion`. |
| 91 | bool hasPTXWithAccelSMs(unsigned PTXVersion, |
| 92 | ArrayRef<unsigned> SMVersions) const; |
| 93 | |
| 94 | bool has256BitVectorLoadStore(unsigned AS) const { |
| 95 | return SmVersion >= 100 && PTXVersion >= 88 && |
| 96 | AS == NVPTXAS::ADDRESS_SPACE_GLOBAL; |
| 97 | } |
| 98 | bool hasUsedBytesMaskPragma() const { |
| 99 | return SmVersion >= 50 && PTXVersion >= 83; |
| 100 | } |
| 101 | bool hasAtomAddF64() const { return SmVersion >= 60; } |
| 102 | bool hasAtomScope() const { return SmVersion >= 60; } |
| 103 | bool hasAtomBitwise64() const { return SmVersion >= 32; } |
| 104 | bool hasAtomMinMax64() const { return SmVersion >= 32; } |
| 105 | bool hasAtomCas16() const { return SmVersion >= 70 && PTXVersion >= 63; } |
| 106 | bool hasAtomSwap128() const { return SmVersion >= 90 && PTXVersion >= 83; } |
| 107 | bool hasClusters() const { return SmVersion >= 90 && PTXVersion >= 78; } |
| 108 | bool hasLDG() const { return SmVersion >= 32; } |
| 109 | bool hasHWROT32() const { return SmVersion >= 32; } |
| 110 | bool hasBrx() const { return SmVersion >= 30 && PTXVersion >= 60; } |
| 111 | bool hasFP16Math() const { return SmVersion >= 53; } |
| 112 | bool hasBF16Math() const { return SmVersion >= 80; } |
| 113 | bool allowFP16Math() const; |
| 114 | bool hasMaskOperator() const { return PTXVersion >= 71; } |
| 115 | bool hasNoReturn() const { return SmVersion >= 30 && PTXVersion >= 64; } |
| 116 | // Does SM & PTX support memory orderings (weak and atomic: relaxed, acquire, |
| 117 | // release, acq_rel, sc) ? |
| 118 | bool hasMemoryOrdering() const { return SmVersion >= 70 && PTXVersion >= 60; } |
| 119 | // Does SM & PTX support .acquire and .release qualifiers for fence? |
| 120 | bool hasSplitAcquireAndReleaseFences() const { |
| 121 | return SmVersion >= 90 && PTXVersion >= 86; |
| 122 | } |
| 123 | // Does SM & PTX support atomic relaxed MMIO operations ? |
| 124 | bool hasRelaxedMMIO() const { return SmVersion >= 70 && PTXVersion >= 82; } |
| 125 | bool hasDotInstructions() const { |
| 126 | return SmVersion >= 61 && PTXVersion >= 50; |
| 127 | } |
| 128 | |
| 129 | // Checks following instructions support: |
| 130 | // - tcgen05.ld/st |
| 131 | // - tcgen05.alloc/dealloc/relinquish |
| 132 | // - tcgen05.cp |
| 133 | // - tcgen05.fence/wait |
| 134 | // - tcgen05.commit |
| 135 | // - tcgen05.mma |
| 136 | bool hasTcgen05InstSupport() const { |
| 137 | // sm_101 renamed to sm_110 in PTX 9.0 |
| 138 | return hasPTXWithFamilySMs(PTXVersion: 90, SMVersions: {100, 110}) || |
| 139 | hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {100, 101}) || |
| 140 | hasPTXWithAccelSMs(PTXVersion: 86, SMVersions: {100, 101}); |
| 141 | } |
| 142 | |
| 143 | // Checks tcgen05.shift instruction support. |
| 144 | bool hasTcgen05ShiftSupport() const { |
| 145 | // sm_101 renamed to sm_110 in PTX 9.0 |
| 146 | return hasPTXWithAccelSMs(PTXVersion: 90, SMVersions: {100, 110, 103}) || |
| 147 | hasPTXWithAccelSMs(PTXVersion: 88, SMVersions: {100, 101, 103}) || |
| 148 | hasPTXWithAccelSMs(PTXVersion: 86, SMVersions: {100, 101}); |
| 149 | } |
| 150 | |
| 151 | bool hasTcgen05MMAScaleInputDImm() const { |
| 152 | return hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {100}) || hasPTXWithAccelSMs(PTXVersion: 86, SMVersions: {100}); |
| 153 | } |
| 154 | |
| 155 | bool hasTcgen05MMAI8Kind() const { |
| 156 | return hasPTXWithAccelSMs(PTXVersion: 90, SMVersions: {100, 110}) || |
| 157 | hasPTXWithAccelSMs(PTXVersion: 86, SMVersions: {100, 101}); |
| 158 | } |
| 159 | |
| 160 | bool hasTcgen05MMASparseMxf4nvf4() const { |
| 161 | return hasPTXWithAccelSMs(PTXVersion: 90, SMVersions: {100, 110, 103}) || |
| 162 | hasPTXWithAccelSMs(PTXVersion: 87, SMVersions: {100, 101, 103}); |
| 163 | } |
| 164 | |
| 165 | bool hasTcgen05MMASparseMxf4() const { |
| 166 | return hasPTXWithAccelSMs(PTXVersion: 90, SMVersions: {100, 110, 103}) || |
| 167 | hasPTXWithAccelSMs(PTXVersion: 86, SMVersions: {100, 101, 103}); |
| 168 | } |
| 169 | |
| 170 | bool hasTcgen05LdRedSupport() const { |
| 171 | return hasPTXWithFamilySMs(PTXVersion: 90, SMVersions: {110, 103}) || |
| 172 | hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {101, 103}); |
| 173 | } |
| 174 | |
| 175 | bool hasReduxSyncF32() const { |
| 176 | return hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {100}) || hasPTXWithAccelSMs(PTXVersion: 86, SMVersions: {100}); |
| 177 | } |
| 178 | |
| 179 | bool hasMMABlockScale() const { |
| 180 | return hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {120}) || hasPTXWithAccelSMs(PTXVersion: 87, SMVersions: {120}); |
| 181 | } |
| 182 | |
| 183 | bool hasMMASparseBlockScaleF4() const { |
| 184 | return hasPTXWithAccelSMs(PTXVersion: 87, SMVersions: {120, 121}); |
| 185 | } |
| 186 | |
| 187 | bool hasMMAWithMXF4NVF4Scale4xE8M0() const { |
| 188 | return hasPTXWithFamilySMs(PTXVersion: 91, SMVersions: {120}); |
| 189 | } |
| 190 | |
| 191 | bool hasMMASparseWithMXF4NVF4Scale4xE8M0() const { |
| 192 | return hasPTXWithAccelSMs(PTXVersion: 91, SMVersions: {120, 121}); |
| 193 | } |
| 194 | |
| 195 | // f32x2 instructions in Blackwell family |
| 196 | bool hasF32x2Instructions() const; |
| 197 | |
| 198 | // Checks support for following in TMA: |
| 199 | // - cta_group::1/2 support |
| 200 | // - im2col_w/w_128 mode support |
| 201 | // - tile_gather4 mode support |
| 202 | // - tile_scatter4 mode support |
| 203 | bool hasTMABlackwellSupport() const { |
| 204 | return hasPTXWithFamilySMs(PTXVersion: 90, SMVersions: {100, 110}) || |
| 205 | hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {100, 101}) || |
| 206 | hasPTXWithAccelSMs(PTXVersion: 86, SMVersions: {100, 101}); |
| 207 | } |
| 208 | |
| 209 | // Checks support for conversions involving e4m3x2 and e5m2x2. |
| 210 | bool hasFP8ConversionSupport() const { |
| 211 | if (PTXVersion >= 81) |
| 212 | return SmVersion >= 89; |
| 213 | |
| 214 | if (PTXVersion >= 78) |
| 215 | return SmVersion >= 90; |
| 216 | |
| 217 | return false; |
| 218 | } |
| 219 | |
| 220 | // Checks support for conversions involving the following types: |
| 221 | // - e2m3x2/e3m2x2 |
| 222 | // - e2m1x2 |
| 223 | // - ue8m0x2 |
| 224 | bool hasNarrowFPConversionSupport() const { |
| 225 | return hasPTXWithFamilySMs(PTXVersion: 90, SMVersions: {100, 110, 120}) || |
| 226 | hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {100, 101, 120}) || |
| 227 | hasPTXWithAccelSMs(PTXVersion: 86, SMVersions: {100, 101, 120}); |
| 228 | } |
| 229 | |
| 230 | // Checks support for conversions involving the following types: |
| 231 | // - bf16x2 -> f8x2 |
| 232 | // - f16x2 -> f6x2 |
| 233 | // - bf16x2 -> f6x2 |
| 234 | // - f16x2 -> f4x2 |
| 235 | // - bf16x2 -> f4x2 |
| 236 | bool hasFP16X2ToNarrowFPConversionSupport() const { |
| 237 | return hasPTXWithFamilySMs(PTXVersion: 91, SMVersions: {100, 110, 120}); |
| 238 | } |
| 239 | |
| 240 | bool hasS2F6X2ConversionSupport() const { |
| 241 | return hasPTXWithAccelSMs(PTXVersion: 91, SMVersions: {100, 103, 110, 120, 121}); |
| 242 | } |
| 243 | |
| 244 | bool hasTensormapReplaceSupport() const { |
| 245 | return hasPTXWithFamilySMs(PTXVersion: 90, SMVersions: {90, 100, 110, 120}) || |
| 246 | hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {90, 100, 101, 120}) || |
| 247 | hasPTXWithAccelSMs(PTXVersion: 83, SMVersions: {90, 100, 101, 120}); |
| 248 | } |
| 249 | |
| 250 | bool hasTensormapReplaceElemtypeSupport(unsigned value) const { |
| 251 | if (value >= static_cast<unsigned>(nvvm::TensormapElemType::B4x16)) |
| 252 | return hasPTXWithFamilySMs(PTXVersion: 90, SMVersions: {100, 110, 120}) || |
| 253 | hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {100, 101, 120}) || |
| 254 | hasPTXWithAccelSMs(PTXVersion: 87, SMVersions: {100, 101, 120}); |
| 255 | |
| 256 | return hasTensormapReplaceSupport(); |
| 257 | } |
| 258 | |
| 259 | bool hasTensormapReplaceSwizzleAtomicitySupport() const { |
| 260 | return hasPTXWithFamilySMs(PTXVersion: 90, SMVersions: {100, 110, 120}) || |
| 261 | hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {100, 101, 120}) || |
| 262 | hasPTXWithAccelSMs(PTXVersion: 87, SMVersions: {100, 101, 120}); |
| 263 | } |
| 264 | |
| 265 | bool hasTensormapReplaceSwizzleModeSupport(unsigned value) const { |
| 266 | if (value == static_cast<unsigned>(nvvm::TensormapSwizzleMode::SWIZZLE_96B)) |
| 267 | return hasPTXWithAccelSMs(PTXVersion: 88, SMVersions: {103}); |
| 268 | |
| 269 | return hasTensormapReplaceSupport(); |
| 270 | } |
| 271 | |
| 272 | bool hasClusterLaunchControlTryCancelMulticastSupport() const { |
| 273 | return hasPTXWithFamilySMs(PTXVersion: 90, SMVersions: {100, 110, 120}) || |
| 274 | hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {100, 101, 120}) || |
| 275 | hasPTXWithAccelSMs(PTXVersion: 86, SMVersions: {100, 101, 120}); |
| 276 | } |
| 277 | |
| 278 | bool hasSetMaxNRegSupport() const { |
| 279 | return hasPTXWithFamilySMs(PTXVersion: 90, SMVersions: {100, 110, 120}) || |
| 280 | hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {100, 101, 120}) || |
| 281 | hasPTXWithAccelSMs(PTXVersion: 86, SMVersions: {100, 101, 120}) || |
| 282 | hasPTXWithAccelSMs(PTXVersion: 80, SMVersions: {90}); |
| 283 | } |
| 284 | |
| 285 | bool hasLdStmatrixBlackwellSupport() const { |
| 286 | return hasPTXWithFamilySMs(PTXVersion: 90, SMVersions: {100, 110, 120}) || |
| 287 | hasPTXWithFamilySMs(PTXVersion: 88, SMVersions: {100, 101, 120}) || |
| 288 | hasPTXWithAccelSMs(PTXVersion: 86, SMVersions: {100, 101, 120}); |
| 289 | } |
| 290 | |
| 291 | // Prior to CUDA 12.3 ptxas did not recognize that the trap instruction |
| 292 | // terminates a basic block. Instead, it would assume that control flow |
| 293 | // continued to the next instruction. The next instruction could be in the |
| 294 | // block that's lexically below it. This would lead to a phantom CFG edges |
| 295 | // being created within ptxas. This issue was fixed in CUDA 12.3. Thus, when |
| 296 | // PTX ISA versions 8.3+ we can confidently say that the bug will not be |
| 297 | // present. |
| 298 | bool hasPTXASUnreachableBug() const { return PTXVersion < 83; } |
| 299 | bool hasCvtaParam() const { return SmVersion >= 70 && PTXVersion >= 77; } |
| 300 | bool hasConvertWithStochasticRounding() const { |
| 301 | return hasPTXWithAccelSMs(PTXVersion: 87, SMVersions: {100, 103}); |
| 302 | } |
| 303 | unsigned int getFullSmVersion() const { return FullSmVersion; } |
| 304 | unsigned int getSmVersion() const { return getFullSmVersion() / 10; } |
| 305 | unsigned int getSmFamilyVersion() const { return getFullSmVersion() / 100; } |
| 306 | // GPUs with "a" suffix have architecture-accelerated features that are |
| 307 | // supported on the specified architecture only, hence such targets do not |
| 308 | // follow the onion layer model. hasArchAccelFeatures() allows distinguishing |
| 309 | // such GPU variants from the base GPU architecture. |
| 310 | // - false represents non-accelerated architecture. |
| 311 | // - true represents architecture-accelerated variant. |
| 312 | bool hasArchAccelFeatures() const { |
| 313 | return (getFullSmVersion() & 1) && PTXVersion >= 80; |
| 314 | } |
| 315 | // GPUs with 'f' suffix have architecture-accelerated features which are |
| 316 | // portable across all future architectures under same SM major. For example, |
| 317 | // sm_100f features will work for sm_10X*f*/sm_10X*a* future architectures. |
| 318 | // - false represents non-family-specific architecture. |
| 319 | // - true represents family-specific variant. |
| 320 | bool hasFamilySpecificFeatures() const { |
| 321 | return getFullSmVersion() % 10 == 2 ? PTXVersion >= 88 |
| 322 | : hasArchAccelFeatures(); |
| 323 | } |
| 324 | // If the user did not provide a target we default to the `sm_75` target. |
| 325 | std::string getTargetName() const { |
| 326 | return TargetName.empty() ? "sm_75" : TargetName; |
| 327 | } |
| 328 | bool hasTargetName() const { return !TargetName.empty(); } |
| 329 | |
| 330 | bool hasNativeBF16Support(int Opcode) const; |
| 331 | |
| 332 | // Get maximum value of required alignments among the supported data types. |
| 333 | // From the PTX ISA doc, section 8.2.3: |
| 334 | // The memory consistency model relates operations executed on memory |
| 335 | // locations with scalar data-types, which have a maximum size and alignment |
| 336 | // of 64 bits. Memory operations with a vector data-type are modelled as a |
| 337 | // set of equivalent memory operations with a scalar data-type, executed in |
| 338 | // an unspecified order on the elements in the vector. |
| 339 | unsigned getMaxRequiredAlignment() const { return 8; } |
| 340 | // Get the smallest cmpxchg word size that the hardware supports. |
| 341 | unsigned getMinCmpXchgSizeInBits() const { return 32; } |
| 342 | |
| 343 | unsigned getPTXVersion() const { return PTXVersion; } |
| 344 | |
| 345 | NVPTXSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS); |
| 346 | void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); |
| 347 | |
| 348 | void failIfClustersUnsupported(std::string const &FailureMessage) const; |
| 349 | }; |
| 350 | |
| 351 | } // End llvm namespace |
| 352 | |
| 353 | #endif |
| 354 | |