AMDGPUPreloadKernArgProlog.cpp source code [llvm_projects/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp]

1	//===- AMDGPUPreloadKernArgProlog.cpp - Preload KernArg Prolog ------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	/// \file This pass creates a backward compatibility layer for kernel argument
10	/// preloading in situations where code is compiled with kernel argument
11	/// preloading enabled but executed on hardware without firmware support for it.
12	///
13	/// To avoid recompilation, the pass inserts a block at the beginning of the
14	/// program that loads the kernel arguments into SGPRs using s_load
15	/// instructions. This sets up the registers exactly as they would be on systems
16	/// with compatible firmware.
17	///
18	/// This effectively creates two entry points for the kernel. Firmware that
19	/// supports the feature will automatically jump past the first 256 bytes of the
20	/// program, skipping the compatibility layer and directly starting execution on
21	/// the optimized code path.
22	///
23	/// This pass should be run as late as possible to prevent any optimizations
24	/// that might assume the padding is dead code or that the added prologue is a
25	/// true predecessor of the kernel entry block.
26	//
27	//===----------------------------------------------------------------------===//
28
29	#include "AMDGPUPreloadKernArgProlog.h"
30	#include "AMDGPU.h"
31	#include "GCNSubtarget.h"
32	#include "SIMachineFunctionInfo.h"
33	#include "llvm/CodeGen/MachineFunctionPass.h"
34	#include "llvm/TargetParser/TargetParser.h"
35
36	using namespace llvm;
37
38	#define DEBUG_TYPE "amdgpu-preload-kern-arg-prolog"
39
40	namespace {
41
42	// Used to build s_loads maping user SGPRs to kernel arguments
43	struct LoadConfig {
44	unsigned Size;
45	const TargetRegisterClass *RegClass;
46	unsigned Opcode;
47	Register LoadReg = Register ();
48	};
49
50	class AMDGPUPreloadKernArgProlog {
51	public:
52	AMDGPUPreloadKernArgProlog(MachineFunction &MF);
53
54	bool run();
55
56	private:
57	MachineFunction &MF;
58	const GCNSubtarget &ST;
59	const SIMachineFunctionInfo &MFI;
60	const SIInstrInfo &TII;
61	const TargetRegisterInfo &TRI;
62
63	// Create a new block before the entry point to the kernel. Firmware that
64	// supports preloading kernel arguments will automatically jump past this
65	// block to the alternative kernel entry point.
66	void createBackCompatBlock(unsigned NumKernArgPreloadSGPRs);
67
68	// Add instructions to load kernel arguments into SGPRs.
69	void addBackCompatLoads(MachineBasicBlock *BackCompatMBB,
70	Register KernArgSegmentPtr,
71	unsigned NumKernArgPreloadSGPRs);
72	};
73
74	class AMDGPUPreloadKernArgPrologLegacy : public MachineFunctionPass {
75	public:
76	static char ID;
77
78	AMDGPUPreloadKernArgPrologLegacy() : MachineFunctionPass (ID) {}
79
80	StringRef getPassName() const override {
81	return "AMDGPU Preload Kernel Arguments Prolog";
82	}
83
84	bool runOnMachineFunction(MachineFunction &MF) override;
85	};
86
87	} // end anonymous namespace
88
89	char AMDGPUPreloadKernArgPrologLegacy::ID = `0`;
90
91	INITIALIZE_PASS(AMDGPUPreloadKernArgPrologLegacy, DEBUG_TYPE,
92	"AMDGPU Preload Kernel Arguments Prolog", false, false)
93
94	char &llvm::AMDGPUPreloadKernArgPrologLegacyID =
95	AMDGPUPreloadKernArgPrologLegacy::ID;
96
97	FunctionPass *llvm::createAMDGPUPreloadKernArgPrologLegacyPass() {
98	return new AMDGPUPreloadKernArgPrologLegacy ();
99	}
100
101	bool AMDGPUPreloadKernArgPrologLegacy::runOnMachineFunction(
102	MachineFunction &MF) {
103	return AMDGPUPreloadKernArgProlog (MF).run();
104	}
105
106	AMDGPUPreloadKernArgProlog::AMDGPUPreloadKernArgProlog(MachineFunction &MF)
107	: MF(MF), ST(MF.getSubtarget<GCNSubtarget>()),
108	MFI(MF.getInfo<SIMachineFunctionInfo>()), TII(ST.getInstrInfo()),
109	TRI(*ST.getRegisterInfo()) {}
110
111	bool AMDGPUPreloadKernArgProlog::run() {
112	if (!ST.hasKernargPreload())
113	return false;
114
115	unsigned NumKernArgPreloadSGPRs = MFI.getNumKernargPreloadedSGPRs();
116	if (!NumKernArgPreloadSGPRs)
117	return false;
118
119	createBackCompatBlock(NumKernArgPreloadSGPRs);
120	return true;
121	}
122
123	void AMDGPUPreloadKernArgProlog::createBackCompatBlock(
124	unsigned NumKernArgPreloadSGPRs) {
125	auto KernelEntryMBB = MF.begin();
126	MachineBasicBlock *BackCompatMBB = MF.CreateMachineBasicBlock();
127	MF.insert(MBBI: KernelEntryMBB, MBB: BackCompatMBB);
128
129	assert(MFI.getUserSGPRInfo().hasKernargSegmentPtr() &&
130	"Kernel argument segment pointer register not set.");
131	Register KernArgSegmentPtr = MFI.getArgInfo().KernargSegmentPtr.getRegister();
132	BackCompatMBB->addLiveIn(PhysReg: KernArgSegmentPtr);
133
134	// Load kernel arguments to SGPRs
135	addBackCompatLoads(BackCompatMBB, KernArgSegmentPtr, NumKernArgPreloadSGPRs);
136
137	// Wait for loads to complete
138	AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(GPU: ST.getCPU());
139	unsigned Waitcnt =
140	AMDGPU::encodeWaitcnt(Version: IV, Vmcnt: getVmcntBitMask(Version: IV), Expcnt: getExpcntBitMask(Version: IV), Lgkmcnt: `0`);
141	BuildMI(BB: BackCompatMBB, MIMD: DebugLoc (), MCID: TII.get(Opcode: AMDGPU::S_WAITCNT))
142	.addImm(Val: Waitcnt);
143
144	// Branch to kernel start
145	BuildMI(BB: BackCompatMBB, MIMD: DebugLoc (), MCID: TII.get(Opcode: AMDGPU::S_BRANCH))
146	.addMBB(MBB: &*KernelEntryMBB);
147	BackCompatMBB->addSuccessor(Succ: &*KernelEntryMBB);
148
149	// Create a new basic block for padding to 256 bytes
150	MachineBasicBlock *PadMBB = MF.CreateMachineBasicBlock();
151	MF.insert(MBBI: ++BackCompatMBB->getIterator(), MBB: PadMBB);
152	PadMBB->setAlignment(Align (`256`));
153	PadMBB->addSuccessor(Succ: &*KernelEntryMBB);
154	}
155
156	/// Find the largest possible load size that fits with SGPR alignment
157	static LoadConfig getLoadParameters(const TargetRegisterInfo &TRI,
158	Register KernArgPreloadSGPR,
159	unsigned NumKernArgPreloadSGPRs) {
160	static constexpr LoadConfig Configs[] = {
161	{.Size: `8`, .RegClass: &AMDGPU::SReg_256RegClass, .Opcode: AMDGPU::S_LOAD_DWORDX8_IMM},
162	{.Size: `4`, .RegClass: &AMDGPU::SReg_128RegClass, .Opcode: AMDGPU::S_LOAD_DWORDX4_IMM},
163	{.Size: `2`, .RegClass: &AMDGPU::SReg_64RegClass, .Opcode: AMDGPU::S_LOAD_DWORDX2_IMM}};
164
165	for (const auto &Config : Configs) {
166	if (NumKernArgPreloadSGPRs >= Config.Size) {
167	Register LoadReg = TRI.getMatchingSuperReg(Reg: KernArgPreloadSGPR,
168	SubIdx: AMDGPU::sub0, RC: Config.RegClass);
169	if (LoadReg) {
170	LoadConfig C(Config);
171	C.LoadReg = LoadReg;
172	return C;
173	}
174	}
175	}
176
177	// Fallback to a single register
178	return LoadConfig{.Size: `1`, .RegClass: &AMDGPU::SReg_32RegClass, .Opcode: AMDGPU::S_LOAD_DWORD_IMM,
179	.LoadReg: KernArgPreloadSGPR};
180	}
181
182	void AMDGPUPreloadKernArgProlog::addBackCompatLoads(
183	MachineBasicBlock *BackCompatMBB, Register KernArgSegmentPtr,
184	unsigned NumKernArgPreloadSGPRs) {
185	Register KernArgPreloadSGPR = MFI.getArgInfo().FirstKernArgPreloadReg;
186	unsigned Offset = `0`;
187	// Fill all user SGPRs used for kernarg preloading with sequential data from
188	// the kernarg segment
189	while (NumKernArgPreloadSGPRs > `0`) {
190	LoadConfig Config =
191	getLoadParameters(TRI, KernArgPreloadSGPR, NumKernArgPreloadSGPRs);
192
193	BuildMI(BB: BackCompatMBB, MIMD: DebugLoc (), MCID: TII.get(Opcode: Config.Opcode), DestReg: Config.LoadReg)
194	.addReg(RegNo: KernArgSegmentPtr)
195	.addImm(Val: Offset)
196	.addImm(Val: `0`);
197
198	Offset += `4` * Config.Size;
199	KernArgPreloadSGPR = KernArgPreloadSGPR.asMCReg() + Config.Size;
200	NumKernArgPreloadSGPRs -= Config.Size;
201	}
202	}
203
204	PreservedAnalyses
205	AMDGPUPreloadKernArgPrologPass::run(MachineFunction &MF,
206	MachineFunctionAnalysisManager &) {
207	if (!AMDGPUPreloadKernArgProlog (MF).run())
208	return PreservedAnalyses::all();
209
210	return getMachineFunctionPassPreservedAnalyses();
211	}
212

Browse the source code of llvm_projects/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp