1//===-- Main entry into the loader interface ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This utility is used to launch standard programs onto the GPU in conjunction
10// with the LLVM 'libc' project. It is designed to mimic a standard emulator
11// workflow, allowing for unit tests to be run on the GPU directly.
12//
13//===----------------------------------------------------------------------===//
14
15#include "llvm-gpu-loader.h"
16
17#include "llvm/BinaryFormat/Magic.h"
18#include "llvm/Object/ELF.h"
19#include "llvm/Object/ELFObjectFile.h"
20#include "llvm/Support/CommandLine.h"
21#include "llvm/Support/Error.h"
22#include "llvm/Support/FileSystem.h"
23#include "llvm/Support/MemoryBuffer.h"
24#include "llvm/Support/Path.h"
25#include "llvm/Support/Signals.h"
26#include "llvm/Support/WithColor.h"
27#include "llvm/TargetParser/Triple.h"
28
29#include <cerrno>
30#include <cstdio>
31#include <cstdlib>
32#include <cstring>
33#include <string>
34
35using namespace llvm;
36
37static cl::OptionCategory LoaderCategory("loader options");
38
39static cl::opt<bool> Help("h", cl::desc("Alias for -help"), cl::Hidden,
40 cl::cat(LoaderCategory));
41
42static cl::opt<unsigned>
43 ThreadsX("threads-x", cl::desc("Number of threads in the 'x' dimension"),
44 cl::init(Val: 1), cl::cat(LoaderCategory));
45static cl::opt<unsigned>
46 ThreadsY("threads-y", cl::desc("Number of threads in the 'y' dimension"),
47 cl::init(Val: 1), cl::cat(LoaderCategory));
48static cl::opt<unsigned>
49 ThreadsZ("threads-z", cl::desc("Number of threads in the 'z' dimension"),
50 cl::init(Val: 1), cl::cat(LoaderCategory));
51static cl::alias threads("threads", cl::aliasopt(ThreadsX),
52 cl::desc("Alias for --threads-x"),
53 cl::cat(LoaderCategory));
54
55static cl::opt<unsigned>
56 BlocksX("blocks-x", cl::desc("Number of blocks in the 'x' dimension"),
57 cl::init(Val: 1), cl::cat(LoaderCategory));
58static cl::opt<unsigned>
59 BlocksY("blocks-y", cl::desc("Number of blocks in the 'y' dimension"),
60 cl::init(Val: 1), cl::cat(LoaderCategory));
61static cl::opt<unsigned>
62 BlocksZ("blocks-z", cl::desc("Number of blocks in the 'z' dimension"),
63 cl::init(Val: 1), cl::cat(LoaderCategory));
64static cl::alias Blocks("blocks", cl::aliasopt(BlocksX),
65 cl::desc("Alias for --blocks-x"),
66 cl::cat(LoaderCategory));
67
68static cl::opt<std::string> File(cl::Positional, cl::Required,
69 cl::desc("<gpu executable>"),
70 cl::cat(LoaderCategory));
71static cl::list<std::string> Args(cl::ConsumeAfter,
72 cl::desc("<program arguments>..."),
73 cl::cat(LoaderCategory));
74
// The arguments to the '_begin' kernel.
//
// An instance is handed to the kernel launch by address and sizeof(), so the
// member order and types form the argument layout.
// NOTE(review): presumably this must match the '_begin' definition on the
// device side -- confirm before reordering or adding members.
struct BeginArgs {
  int Argc;   // Number of entries in Argv.
  void *Argv; // Buffer produced by copyArgumentVector().
  void *Envp; // Buffer produced by copyEnvironment().
};
81
// The arguments to the '_start' kernel.
//
// An instance is handed to the kernel launch by address and sizeof(), so the
// member order and types form the argument layout.
// NOTE(review): presumably this must match the '_start' definition on the
// device side -- confirm before reordering or adding members.
struct StartArgs {
  int Argc;   // Number of entries in Argv.
  void *Argv; // Buffer produced by copyArgumentVector().
  void *Envp; // Buffer produced by copyEnvironment().
  void *Ret;  // Device allocation of sizeof(int); main() reads the exit code
              // back from it after the kernels have run.
};
89
// The arguments to the '_end' kernel.
//
// Intentionally empty: launchKernel() detects an empty argument type and
// passes an argument size of zero for it.
struct EndArgs {};
92
93[[noreturn]] static void handleError(Error E) {
94 outs().flush();
95 logAllUnhandledErrors(E: std::move(E), OS&: WithColor::error(OS&: errs(), Prefix: "loader"));
96 exit(EXIT_FAILURE);
97}
98
99[[noreturn]] static void handleError(ol_result_t Err, unsigned Line) {
100 fprintf(stderr, format: "%s:%d %s\n", __FILE__, Line, Err->Details);
101 exit(EXIT_FAILURE);
102}
103
// Evaluates the offload API call X exactly once and, when it yields a non-null
// result, forwards that result and the current source line to handleError().
//
// The do/while(0) wrapper makes the expansion a single statement that needs a
// trailing semicolon at the call site, so it is safe inside an unbraced
// if/else. X is parenthesized so any expression can be passed.
#define OFFLOAD_ERR(X)                                                         \
  do {                                                                         \
    if (ol_result_t Err = (X))                                                 \
      handleError(Err, __LINE__);                                              \
  } while (0)
107
108static void *copyArgumentVector(int Argc, const char **Argv,
109 ol_device_handle_t Device) {
110 size_t ArgSize = sizeof(char *) * (Argc + 1);
111 size_t StringLen = 0;
112 for (int i = 0; i < Argc; ++i)
113 StringLen += strlen(s: Argv[i]) + 1;
114
115 // We allocate enough space for a null terminated array and all the strings.
116 void *DevArgv;
117 OFFLOAD_ERR(
118 olMemAlloc(Device, OL_ALLOC_TYPE_HOST, ArgSize + StringLen, &DevArgv));
119 if (!DevArgv)
120 handleError(
121 E: createStringError(Fmt: "Failed to allocate memory for environment."));
122
123 // Store the strings linerally in the same memory buffer.
124 void *DevString = reinterpret_cast<uint8_t *>(DevArgv) + ArgSize;
125 for (int i = 0; i < Argc; ++i) {
126 size_t size = strlen(s: Argv[i]) + 1;
127 std::memcpy(dest: DevString, src: Argv[i], n: size);
128 static_cast<void **>(DevArgv)[i] = DevString;
129 DevString = reinterpret_cast<uint8_t *>(DevString) + size;
130 }
131
132 // Ensure the vector is null terminated.
133 reinterpret_cast<void **>(DevArgv)[Argc] = nullptr;
134 return DevArgv;
135}
136
137void *copyEnvironment(const char **Envp, ol_device_handle_t Device) {
138 int Envc = 0;
139 for (const char **Env = Envp; *Env != 0; ++Env)
140 ++Envc;
141
142 return copyArgumentVector(Argc: Envc, Argv: Envp, Device);
143}
144
145ol_device_handle_t findDevice(MemoryBufferRef Binary) {
146 ol_device_handle_t Device;
147 std::tuple Data = std::make_tuple(args: &Device, args: &Binary);
148 OFFLOAD_ERR(olIterateDevices(
149 [](ol_device_handle_t Device, void *UserData) {
150 auto &[Output, Binary] = *reinterpret_cast<decltype(Data) *>(UserData);
151 bool IsValid = false;
152 OFFLOAD_ERR(olIsValidBinary(Device, Binary->getBufferStart(),
153 Binary->getBufferSize(), &IsValid));
154 if (!IsValid)
155 return true;
156
157 *Output = Device;
158 return false;
159 },
160 &Data));
161 return Device;
162}
163
164ol_device_handle_t getHostDevice() {
165 ol_device_handle_t Device;
166 OFFLOAD_ERR(olIterateDevices(
167 [](ol_device_handle_t Device, void *UserData) {
168 ol_platform_handle_t Platform;
169 olGetDeviceInfo(Device, OL_DEVICE_INFO_PLATFORM, sizeof(Platform),
170 &Platform);
171 ol_platform_backend_t Backend;
172 olGetPlatformInfo(Platform, OL_PLATFORM_INFO_BACKEND, sizeof(Backend),
173 &Backend);
174
175 auto &Output = *reinterpret_cast<decltype(Device) *>(UserData);
176 if (Backend == OL_PLATFORM_BACKEND_HOST) {
177 Output = Device;
178 return false;
179 }
180 return true;
181 },
182 &Device));
183 return Device;
184}
185
186template <typename Args>
187void launchKernel(ol_queue_handle_t Queue, ol_device_handle_t Device,
188 ol_program_handle_t Program, const char *Name,
189 ol_kernel_launch_size_args_t LaunchArgs, Args &KernelArgs) {
190 ol_symbol_handle_t Kernel;
191 OFFLOAD_ERR(olGetSymbol(Program, Name, OL_SYMBOL_KIND_KERNEL, &Kernel));
192
193 OFFLOAD_ERR(olLaunchKernel(Queue, Device, Kernel, &KernelArgs,
194 std::is_empty_v<Args> ? 0 : sizeof(Args),
195 &LaunchArgs));
196}
197
198int main(int argc, const char **argv, const char **envp) {
199 sys::PrintStackTraceOnErrorSignal(Argv0: argv[0]);
200 cl::HideUnrelatedOptions(Category&: LoaderCategory);
201 cl::ParseCommandLineOptions(
202 argc, argv,
203 Overview: "A utility used to launch unit tests built for a GPU target. This is\n"
204 "intended to provide an intrface simular to cross-compiling emulators\n");
205
206 if (Help) {
207 cl::PrintHelpMessage();
208 return EXIT_SUCCESS;
209 }
210
211 if (Error Err = loadLLVMOffload())
212 handleError(E: std::move(Err));
213
214 ErrorOr<std::unique_ptr<MemoryBuffer>> ImageOrErr =
215 MemoryBuffer::getFileOrSTDIN(Filename: File);
216 if (std::error_code EC = ImageOrErr.getError())
217 handleError(E: errorCodeToError(EC));
218 MemoryBufferRef Image = **ImageOrErr;
219
220 ol_platform_backend_t Backend;
221 ol_init_args_t InitArgs = OL_INIT_ARGS_INIT;
222
223 file_magic Magic = identify_magic(magic: Image.getBuffer());
224 if (Magic >= file_magic::elf && Magic <= file_magic::elf_core) {
225 Expected<object::ELFFile<object::ELF64LE>> ElfOrErr =
226 object::ELFFile<object::ELF64LE>::create(Object: Image.getBuffer());
227 if (!ElfOrErr)
228 handleError(E: ElfOrErr.takeError());
229
230 switch (ElfOrErr->getHeader().e_machine) {
231 case ELF::EM_AMDGPU:
232 Backend = OL_PLATFORM_BACKEND_AMDGPU;
233 break;
234 case ELF::EM_CUDA:
235 Backend = OL_PLATFORM_BACKEND_CUDA;
236 break;
237 default:
238 handleError(E: createStringError(
239 Fmt: "unhandled ELF architecture: %s",
240 Vals: ELF::convertEMachineToArchName(EMachine: ElfOrErr->getHeader().e_machine)
241 .data()));
242 }
243 InitArgs.NumPlatforms = 1;
244 InitArgs.Platforms = &Backend;
245 }
246
247 SmallVector<const char *> NewArgv = {File.c_str()};
248 llvm::transform(Range&: Args, d_first: std::back_inserter(x&: NewArgv),
249 F: [](const std::string &Arg) { return Arg.c_str(); });
250
251 OFFLOAD_ERR(olInit(&InitArgs));
252 ol_device_handle_t Device = findDevice(Binary: Image);
253 ol_device_handle_t Host = getHostDevice();
254
255 ol_program_handle_t Program;
256 OFFLOAD_ERR(olCreateProgram(Device, Image.getBufferStart(),
257 Image.getBufferSize(), &Program));
258
259 ol_queue_handle_t Queue;
260 OFFLOAD_ERR(olCreateQueue(Device, &Queue));
261
262 int DevArgc = static_cast<int>(NewArgv.size());
263 void *DevArgv = copyArgumentVector(Argc: NewArgv.size(), Argv: NewArgv.begin(), Device);
264 void *DevEnvp = copyEnvironment(Envp: envp, Device);
265
266 void *DevRet;
267 OFFLOAD_ERR(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, sizeof(int), &DevRet));
268
269 ol_kernel_launch_size_args_t BeginLaunch{.Dimensions: 1, .NumGroups: {.x: 1, .y: 1, .z: 1}, .GroupSize: {.x: 1, .y: 1, .z: 1}, .DynSharedMemory: 0};
270 BeginArgs BeginArgs = {.Argc: DevArgc, .Argv: DevArgv, .Envp: DevEnvp};
271 launchKernel(Queue, Device, Program, Name: "_begin", LaunchArgs: BeginLaunch, KernelArgs&: BeginArgs);
272 OFFLOAD_ERR(olSyncQueue(Queue));
273
274 uint32_t Dims = (BlocksZ > 1) ? 3 : (BlocksY > 1) ? 2 : 1;
275 ol_kernel_launch_size_args_t StartLaunch{.Dimensions: Dims,
276 .NumGroups: {.x: BlocksX, .y: BlocksY, .z: BlocksZ},
277 .GroupSize: {.x: ThreadsX, .y: ThreadsY, .z: ThreadsZ},
278 /*SharedMemBytes=*/.DynSharedMemory: 0};
279 StartArgs StartArgs = {.Argc: DevArgc, .Argv: DevArgv, .Envp: DevEnvp, .Ret: DevRet};
280 launchKernel(Queue, Device, Program, Name: "_start", LaunchArgs: StartLaunch, KernelArgs&: StartArgs);
281
282 ol_kernel_launch_size_args_t EndLaunch{.Dimensions: 1, .NumGroups: {.x: 1, .y: 1, .z: 1}, .GroupSize: {.x: 1, .y: 1, .z: 1}, .DynSharedMemory: 0};
283 EndArgs EndArgs = {};
284 launchKernel(Queue, Device, Program, Name: "_end", LaunchArgs: EndLaunch, KernelArgs&: EndArgs);
285
286 int Ret;
287 OFFLOAD_ERR(olMemcpy(Queue, &Ret, Host, DevRet, Device, sizeof(int)));
288 OFFLOAD_ERR(olSyncQueue(Queue));
289
290 OFFLOAD_ERR(olMemFree(DevArgv));
291 OFFLOAD_ERR(olMemFree(DevEnvp));
292 OFFLOAD_ERR(olDestroyQueue(Queue));
293 OFFLOAD_ERR(olDestroyProgram(Program));
294 OFFLOAD_ERR(olShutDown());
295
296 return Ret;
297}
298