1 | //=====-- NVPTXSubtarget.h - Define Subtarget for the NVPTX ---*- C++ -*--====// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
// This file declares the NVPTX-specific subclass of TargetSubtargetInfo.
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #ifndef LLVM_LIB_TARGET_NVPTX_NVPTXSUBTARGET_H |
14 | #define LLVM_LIB_TARGET_NVPTX_NVPTXSUBTARGET_H |
15 | |
16 | #include "NVPTX.h" |
17 | #include "NVPTXFrameLowering.h" |
18 | #include "NVPTXISelLowering.h" |
19 | #include "NVPTXInstrInfo.h" |
20 | #include "NVPTXRegisterInfo.h" |
21 | #include "llvm/CodeGen/TargetSubtargetInfo.h" |
22 | #include "llvm/IR/DataLayout.h" |
23 | #include "llvm/Support/NVPTXAddrSpace.h" |
24 | #include <string> |
25 | |
// Selects the header (class declaration) section of the generated subtarget
// info file included on the next line.
#define GET_SUBTARGETINFO_HEADER
27 | #include "NVPTXGenSubtargetInfo.inc" |
28 | |
29 | namespace llvm { |
30 | |
31 | class NVPTXSubtarget : public NVPTXGenSubtargetInfo { |
32 | virtual void anchor(); |
33 | std::string TargetName; |
34 | |
35 | // PTX version x.y is represented as 10*x+y, e.g. 3.1 == 31 |
36 | unsigned PTXVersion; |
37 | |
38 | // Full SM version x.y is represented as 100*x+10*y+feature, e.g. 3.1 == 310 |
39 | // sm_90a == 901 |
40 | unsigned int FullSmVersion; |
41 | |
42 | // SM version x.y is represented as 10*x+y, e.g. 3.1 == 31. Derived from |
43 | // FullSmVersion. |
44 | unsigned int SmVersion; |
45 | |
46 | NVPTXInstrInfo InstrInfo; |
47 | NVPTXTargetLowering TLInfo; |
48 | std::unique_ptr<const SelectionDAGTargetInfo> TSInfo; |
49 | |
50 | // NVPTX does not have any call stack frame, but need a NVPTX specific |
51 | // FrameLowering class because TargetFrameLowering is abstract. |
52 | NVPTXFrameLowering FrameLowering; |
53 | |
54 | public: |
55 | /// This constructor initializes the data members to match that |
56 | /// of the specified module. |
57 | /// |
58 | NVPTXSubtarget(const Triple &TT, const std::string &CPU, |
59 | const std::string &FS, const NVPTXTargetMachine &TM); |
60 | |
61 | ~NVPTXSubtarget() override; |
62 | |
63 | const TargetFrameLowering *getFrameLowering() const override { |
64 | return &FrameLowering; |
65 | } |
66 | const NVPTXInstrInfo *getInstrInfo() const override { return &InstrInfo; } |
67 | const NVPTXRegisterInfo *getRegisterInfo() const override { |
68 | return &InstrInfo.getRegisterInfo(); |
69 | } |
70 | const NVPTXTargetLowering *getTargetLowering() const override { |
71 | return &TLInfo; |
72 | } |
73 | |
74 | const SelectionDAGTargetInfo *getSelectionDAGInfo() const override; |
75 | |
76 | bool has256BitVectorLoadStore(unsigned AS) const { |
77 | return SmVersion >= 100 && PTXVersion >= 88 && |
78 | AS == NVPTXAS::ADDRESS_SPACE_GLOBAL; |
79 | } |
80 | bool hasAtomAddF64() const { return SmVersion >= 60; } |
81 | bool hasAtomScope() const { return SmVersion >= 60; } |
82 | bool hasAtomBitwise64() const { return SmVersion >= 32; } |
83 | bool hasAtomMinMax64() const { return SmVersion >= 32; } |
84 | bool hasAtomCas16() const { return SmVersion >= 70 && PTXVersion >= 63; } |
85 | bool hasClusters() const { return SmVersion >= 90 && PTXVersion >= 78; } |
86 | bool hasLDG() const { return SmVersion >= 32; } |
87 | bool hasHWROT32() const { return SmVersion >= 32; } |
88 | bool hasFP16Math() const { return SmVersion >= 53; } |
89 | bool hasBF16Math() const { return SmVersion >= 80; } |
90 | bool allowFP16Math() const; |
91 | bool hasMaskOperator() const { return PTXVersion >= 71; } |
92 | bool hasNoReturn() const { return SmVersion >= 30 && PTXVersion >= 64; } |
93 | // Does SM & PTX support memory orderings (weak and atomic: relaxed, acquire, |
94 | // release, acq_rel, sc) ? |
95 | bool hasMemoryOrdering() const { return SmVersion >= 70 && PTXVersion >= 60; } |
96 | // Does SM & PTX support .acquire and .release qualifiers for fence? |
97 | bool hasSplitAcquireAndReleaseFences() const { |
98 | return SmVersion >= 90 && PTXVersion >= 86; |
99 | } |
100 | // Does SM & PTX support atomic relaxed MMIO operations ? |
101 | bool hasRelaxedMMIO() const { return SmVersion >= 70 && PTXVersion >= 82; } |
102 | bool hasDotInstructions() const { |
103 | return SmVersion >= 61 && PTXVersion >= 50; |
104 | } |
105 | // Tcgen05 instructions in Blackwell family |
106 | bool hasTcgen05Instructions() const { |
107 | bool HasTcgen05 = false; |
108 | switch (FullSmVersion) { |
109 | default: |
110 | break; |
111 | case 1003: // sm_100a |
112 | case 1013: // sm_101a |
113 | HasTcgen05 = true; |
114 | break; |
115 | } |
116 | |
117 | return HasTcgen05 && PTXVersion >= 86; |
118 | } |
119 | |
120 | // TMA G2S copy with cta_group::1/2 support |
121 | bool hasCpAsyncBulkTensorCTAGroupSupport() const { |
122 | // TODO: Update/tidy-up after the family-conditional support arrives |
123 | switch (FullSmVersion) { |
124 | case 1003: |
125 | case 1013: |
126 | return PTXVersion >= 86; |
127 | case 1033: |
128 | return PTXVersion >= 88; |
129 | default: |
130 | return false; |
131 | } |
132 | } |
133 | |
134 | // Prior to CUDA 12.3 ptxas did not recognize that the trap instruction |
135 | // terminates a basic block. Instead, it would assume that control flow |
136 | // continued to the next instruction. The next instruction could be in the |
137 | // block that's lexically below it. This would lead to a phantom CFG edges |
138 | // being created within ptxas. This issue was fixed in CUDA 12.3. Thus, when |
139 | // PTX ISA versions 8.3+ we can confidently say that the bug will not be |
140 | // present. |
141 | bool hasPTXASUnreachableBug() const { return PTXVersion < 83; } |
142 | bool hasCvtaParam() const { return SmVersion >= 70 && PTXVersion >= 77; } |
143 | unsigned int getFullSmVersion() const { return FullSmVersion; } |
144 | unsigned int getSmVersion() const { return getFullSmVersion() / 10; } |
145 | // GPUs with "a" suffix have architecture-accelerated features that are |
146 | // supported on the specified architecture only, hence such targets do not |
147 | // follow the onion layer model. hasArchAccelFeatures() allows distinguishing |
148 | // such GPU variants from the base GPU architecture. |
149 | // - false represents non-accelerated architecture. |
150 | // - true represents architecture-accelerated variant. |
151 | bool hasArchAccelFeatures() const { |
152 | return (getFullSmVersion() & 1) && PTXVersion >= 80; |
153 | } |
154 | // GPUs with 'f' suffix have architecture-accelerated features which are |
155 | // portable across all future architectures under same SM major. For example, |
156 | // sm_100f features will work for sm_10X*f*/sm_10X*a* future architectures. |
157 | // - false represents non-family-specific architecture. |
158 | // - true represents family-specific variant. |
159 | bool hasFamilySpecificFeatures() const { |
160 | return getFullSmVersion() % 10 == 2 ? PTXVersion >= 88 |
161 | : hasArchAccelFeatures(); |
162 | } |
163 | // If the user did not provide a target we default to the `sm_30` target. |
164 | std::string getTargetName() const { |
165 | return TargetName.empty() ? "sm_30" : TargetName; |
166 | } |
167 | bool hasTargetName() const { return !TargetName.empty(); } |
168 | |
169 | bool hasNativeBF16Support(int Opcode) const; |
170 | |
171 | // Get maximum value of required alignments among the supported data types. |
172 | // From the PTX ISA doc, section 8.2.3: |
173 | // The memory consistency model relates operations executed on memory |
174 | // locations with scalar data-types, which have a maximum size and alignment |
175 | // of 64 bits. Memory operations with a vector data-type are modelled as a |
176 | // set of equivalent memory operations with a scalar data-type, executed in |
177 | // an unspecified order on the elements in the vector. |
178 | unsigned getMaxRequiredAlignment() const { return 8; } |
179 | // Get the smallest cmpxchg word size that the hardware supports. |
180 | unsigned getMinCmpXchgSizeInBits() const { return 32; } |
181 | |
182 | unsigned getPTXVersion() const { return PTXVersion; } |
183 | |
184 | NVPTXSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS); |
185 | void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); |
186 | |
187 | void failIfClustersUnsupported(std::string const &FailureMessage) const; |
188 | }; |
189 | |
190 | } // End llvm namespace |
191 | |
192 | #endif |
193 | |