NVPTXSubtarget.h source code [llvm_projects/llvm/lib/Target/NVPTX/NVPTXSubtarget.h]

1	//=====-- NVPTXSubtarget.h - Define Subtarget for the NVPTX ---*- C++ -*--====//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This file declares the NVPTX specific subclass of TargetSubtarget.
10	//
11	//===----------------------------------------------------------------------===//
12
13	#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXSUBTARGET_H
14	#define LLVM_LIB_TARGET_NVPTX_NVPTXSUBTARGET_H
15
16	#include "NVPTX.h"
17	#include "NVPTXFrameLowering.h"
18	#include "NVPTXISelLowering.h"
19	#include "NVPTXInstrInfo.h"
20	#include "NVPTXRegisterInfo.h"
21	#include "llvm/CodeGen/TargetSubtargetInfo.h"
22	#include "llvm/IR/DataLayout.h"
23	#include "llvm/Support/NVPTXAddrSpace.h"
24	#include <string>
25
26	#define GET_SUBTARGETINFO_HEADER
27	#include "NVPTXGenSubtargetInfo.inc"
28
29	namespace llvm {
30
31	class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
32	virtual void anchor();
33	std::string TargetName;
34
35	// PTX version x.y is represented as 10x+y, e.g. 3.1 == 31*
36	unsigned PTXVersion;
37
38	// Full SM version x.y is represented as 100x+10y+feature, e.g. 3.1 == 310
39	// sm_90a == 901
40	unsigned int FullSmVersion;
41
42	// SM version x.y is represented as 10x+y, e.g. 3.1 == 31. Derived from*
43	// FullSmVersion.
44	unsigned int SmVersion;
45
46	NVPTXInstrInfo InstrInfo;
47	NVPTXTargetLowering TLInfo;
48	std::unique_ptr<const SelectionDAGTargetInfo> TSInfo;
49
50	// NVPTX does not have any call stack frame, but need a NVPTX specific
51	// FrameLowering class because TargetFrameLowering is abstract.
52	NVPTXFrameLowering FrameLowering;
53
54	public:
55	/// This constructor initializes the data members to match that
56	/// of the specified module.
57	///
58	NVPTXSubtarget(const Triple &TT, const std::string &CPU,
59	const std::string &FS, const NVPTXTargetMachine &TM);
60
61	~NVPTXSubtarget() override;
62
63	const TargetFrameLowering getFrameLowering() const* override {
64	return &FrameLowering;
65	}
66	const NVPTXInstrInfo getInstrInfo() const* override { return &InstrInfo; }
67	const NVPTXRegisterInfo getRegisterInfo() const* override {
68	return &InstrInfo.getRegisterInfo();
69	}
70	const NVPTXTargetLowering getTargetLowering() const* override {
71	return &TLInfo;
72	}
73
74	const SelectionDAGTargetInfo getSelectionDAGInfo() const* override;
75
76	bool has256BitVectorLoadStore(unsigned AS) const {
77	return SmVersion >= `100` && PTXVersion >= `88` &&
78	AS == NVPTXAS::ADDRESS_SPACE_GLOBAL;
79	}
80	bool hasAtomAddF64() const { return SmVersion >= `60`; }
81	bool hasAtomScope() const { return SmVersion >= `60`; }
82	bool hasAtomBitwise64() const { return SmVersion >= `32`; }
83	bool hasAtomMinMax64() const { return SmVersion >= `32`; }
84	bool hasAtomCas16() const { return SmVersion >= `70` && PTXVersion >= `63`; }
85	bool hasClusters() const { return SmVersion >= `90` && PTXVersion >= `78`; }
86	bool hasLDG() const { return SmVersion >= `32`; }
87	bool hasHWROT32() const { return SmVersion >= `32`; }
88	bool hasFP16Math() const { return SmVersion >= `53`; }
89	bool hasBF16Math() const { return SmVersion >= `80`; }
90	bool allowFP16Math() const;
91	bool hasMaskOperator() const { return PTXVersion >= `71`; }
92	bool hasNoReturn() const { return SmVersion >= `30` && PTXVersion >= `64`; }
93	// Does SM & PTX support memory orderings (weak and atomic: relaxed, acquire,
94	// release, acq_rel, sc) ?
95	bool hasMemoryOrdering() const { return SmVersion >= `70` && PTXVersion >= `60`; }
96	// Does SM & PTX support .acquire and .release qualifiers for fence?
97	bool hasSplitAcquireAndReleaseFences() const {
98	return SmVersion >= `90` && PTXVersion >= `86`;
99	}
100	// Does SM & PTX support atomic relaxed MMIO operations ?
101	bool hasRelaxedMMIO() const { return SmVersion >= `70` && PTXVersion >= `82`; }
102	bool hasDotInstructions() const {
103	return SmVersion >= `61` && PTXVersion >= `50`;
104	}
105	// Tcgen05 instructions in Blackwell family
106	bool hasTcgen05Instructions() const {
107	bool HasTcgen05 = false;
108	switch (FullSmVersion) {
109	default:
110	break;
111	case `1003`: // sm_100a
112	case `1013`: // sm_101a
113	HasTcgen05 = true;
114	break;
115	}
116
117	return HasTcgen05 && PTXVersion >= `86`;
118	}
119
120	// TMA G2S copy with cta_group::1/2 support
121	bool hasCpAsyncBulkTensorCTAGroupSupport() const {
122	// TODO: Update/tidy-up after the family-conditional support arrives
123	switch (FullSmVersion) {
124	case `1003`:
125	case `1013`:
126	return PTXVersion >= `86`;
127	case `1033`:
128	return PTXVersion >= `88`;
129	default:
130	return false;
131	}
132	}
133
134	// Prior to CUDA 12.3 ptxas did not recognize that the trap instruction
135	// terminates a basic block. Instead, it would assume that control flow
136	// continued to the next instruction. The next instruction could be in the
137	// block that's lexically below it. This would lead to a phantom CFG edges
138	// being created within ptxas. This issue was fixed in CUDA 12.3. Thus, when
139	// PTX ISA versions 8.3+ we can confidently say that the bug will not be
140	// present.
141	bool hasPTXASUnreachableBug() const { return PTXVersion < `83`; }
142	bool hasCvtaParam() const { return SmVersion >= `70` && PTXVersion >= `77`; }
143	unsigned int getFullSmVersion() const { return FullSmVersion; }
144	unsigned int getSmVersion() const { return getFullSmVersion() / `10`; }
145	// GPUs with "a" suffix have architecture-accelerated features that are
146	// supported on the specified architecture only, hence such targets do not
147	// follow the onion layer model. hasArchAccelFeatures() allows distinguishing
148	// such GPU variants from the base GPU architecture.
149	// - false represents non-accelerated architecture.
150	// - true represents architecture-accelerated variant.
151	bool hasArchAccelFeatures() const {
152	return (getFullSmVersion() & `1`) && PTXVersion >= `80`;
153	}
154	// GPUs with 'f' suffix have architecture-accelerated features which are
155	// portable across all future architectures under same SM major. For example,
156	// sm_100f features will work for sm_10Xf/sm_10Xa* future architectures.*
157	// - false represents non-family-specific architecture.
158	// - true represents family-specific variant.
159	bool hasFamilySpecificFeatures() const {
160	return getFullSmVersion() % `10` == `2` ? PTXVersion >= `88`
161	: hasArchAccelFeatures();
162	}
163	// If the user did not provide a target we default to the `sm_30` target.
164	std::string getTargetName() const {
165	return TargetName.empty() ? "sm_30" : TargetName;
166	}
167	bool hasTargetName() const { return !TargetName.empty(); }
168
169	bool hasNativeBF16Support(int Opcode) const;
170
171	// Get maximum value of required alignments among the supported data types.
172	// From the PTX ISA doc, section 8.2.3:
173	// The memory consistency model relates operations executed on memory
174	// locations with scalar data-types, which have a maximum size and alignment
175	// of 64 bits. Memory operations with a vector data-type are modelled as a
176	// set of equivalent memory operations with a scalar data-type, executed in
177	// an unspecified order on the elements in the vector.
178	unsigned getMaxRequiredAlignment() const { return `8`; }
179	// Get the smallest cmpxchg word size that the hardware supports.
180	unsigned getMinCmpXchgSizeInBits() const { return `32`; }
181
182	unsigned getPTXVersion() const { return PTXVersion; }
183
184	NVPTXSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
185	void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
186
187	void failIfClustersUnsupported(std::string const &FailureMessage) const;
188	};
189
190	} // End llvm namespace
191
192	#endif
193

Browse the source code of llvm_projects/llvm/lib/Target/NVPTX/NVPTXSubtarget.h