1//===-- Main entry into the loader interface ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This utility is used to launch standard programs onto the GPU in conjunction
10// with the LLVM 'libc' project. It is designed to mimic a standard emulator
11// workflow, allowing for unit tests to be run on the GPU directly.
12//
13//===----------------------------------------------------------------------===//
14
15#include "llvm-gpu-loader.h"
16
17#include "llvm/BinaryFormat/Magic.h"
18#include "llvm/Object/ELF.h"
19#include "llvm/Object/ELFObjectFile.h"
20#include "llvm/Support/CommandLine.h"
21#include "llvm/Support/Error.h"
22#include "llvm/Support/FileSystem.h"
23#include "llvm/Support/MemoryBuffer.h"
24#include "llvm/Support/Path.h"
25#include "llvm/Support/Signals.h"
26#include "llvm/Support/WithColor.h"
27#include "llvm/TargetParser/Triple.h"
28
29#include <cerrno>
30#include <cstdio>
31#include <cstdlib>
32#include <cstring>
33#include <string>
34
35using namespace llvm;
36
37static cl::OptionCategory LoaderCategory("loader options");
38
39static cl::opt<bool> Help("h", cl::desc("Alias for -help"), cl::Hidden,
40 cl::cat(LoaderCategory));
41
42static cl::opt<unsigned>
43 ThreadsX("threads-x", cl::desc("Number of threads in the 'x' dimension"),
44 cl::init(Val: 1), cl::cat(LoaderCategory));
45static cl::opt<unsigned>
46 ThreadsY("threads-y", cl::desc("Number of threads in the 'y' dimension"),
47 cl::init(Val: 1), cl::cat(LoaderCategory));
48static cl::opt<unsigned>
49 ThreadsZ("threads-z", cl::desc("Number of threads in the 'z' dimension"),
50 cl::init(Val: 1), cl::cat(LoaderCategory));
51static cl::alias threads("threads", cl::aliasopt(ThreadsX),
52 cl::desc("Alias for --threads-x"),
53 cl::cat(LoaderCategory));
54
55static cl::opt<unsigned>
56 BlocksX("blocks-x", cl::desc("Number of blocks in the 'x' dimension"),
57 cl::init(Val: 1), cl::cat(LoaderCategory));
58static cl::opt<unsigned>
59 BlocksY("blocks-y", cl::desc("Number of blocks in the 'y' dimension"),
60 cl::init(Val: 1), cl::cat(LoaderCategory));
61static cl::opt<unsigned>
62 BlocksZ("blocks-z", cl::desc("Number of blocks in the 'z' dimension"),
63 cl::init(Val: 1), cl::cat(LoaderCategory));
64static cl::alias Blocks("blocks", cl::aliasopt(BlocksX),
65 cl::desc("Alias for --blocks-x"),
66 cl::cat(LoaderCategory));
67
68static cl::opt<std::string> File(cl::Positional, cl::Required,
69 cl::desc("<gpu executable>"),
70 cl::cat(LoaderCategory));
71static cl::list<std::string> Args(cl::ConsumeAfter,
72 cl::desc("<program arguments>..."),
73 cl::cat(LoaderCategory));
74
75[[noreturn]] static void handleError(Error E) {
76 outs().flush();
77 logAllUnhandledErrors(E: std::move(E), OS&: WithColor::error(OS&: errs(), Prefix: "loader"));
78 exit(EXIT_FAILURE);
79}
80
81[[noreturn]] static void handleError(ol_result_t Err, unsigned Line) {
82 fprintf(stderr, format: "%s:%d %s\n", __FILE__, Line, Err->Details);
83 exit(EXIT_FAILURE);
84}
85
// Evaluates the offload API call X and aborts through handleError() if it
// returns a non-null result. The do/while wrapper makes the expansion a single
// statement that requires a trailing ';' and cannot capture a following
// 'else'; X is parenthesized so any expression can be passed safely.
#define OFFLOAD_ERR(X)                                                         \
  do {                                                                         \
    if (ol_result_t Err = (X))                                                 \
      handleError(Err, __LINE__);                                              \
  } while (false)
89
90static void *copyArgumentVector(int Argc, const char **Argv,
91 ol_device_handle_t Device) {
92 size_t ArgSize = sizeof(char *) * (Argc + 1);
93 size_t StringLen = 0;
94 for (int i = 0; i < Argc; ++i)
95 StringLen += strlen(s: Argv[i]) + 1;
96
97 // We allocate enough space for a null terminated array and all the strings.
98 void *DevArgv;
99 OFFLOAD_ERR(
100 olMemAlloc(Device, OL_ALLOC_TYPE_HOST, ArgSize + StringLen, &DevArgv));
101 if (!DevArgv)
102 handleError(
103 E: createStringError(Fmt: "Failed to allocate memory for environment."));
104
105 // Store the strings linerally in the same memory buffer.
106 void *DevString = reinterpret_cast<uint8_t *>(DevArgv) + ArgSize;
107 for (int i = 0; i < Argc; ++i) {
108 size_t size = strlen(s: Argv[i]) + 1;
109 std::memcpy(dest: DevString, src: Argv[i], n: size);
110 static_cast<void **>(DevArgv)[i] = DevString;
111 DevString = reinterpret_cast<uint8_t *>(DevString) + size;
112 }
113
114 // Ensure the vector is null terminated.
115 reinterpret_cast<void **>(DevArgv)[Argc] = nullptr;
116 return DevArgv;
117}
118
119void *copyEnvironment(const char **Envp, ol_device_handle_t Device) {
120 int Envc = 0;
121 for (const char **Env = Envp; *Env != 0; ++Env)
122 ++Envc;
123
124 return copyArgumentVector(Argc: Envc, Argv: Envp, Device);
125}
126
127ol_device_handle_t findDevice(MemoryBufferRef Binary) {
128 ol_device_handle_t Device = nullptr;
129 std::tuple Data = std::make_tuple(args: &Device, args: &Binary);
130 OFFLOAD_ERR(olIterateDevices(
131 [](ol_device_handle_t Device, void *UserData) {
132 auto &[Output, Binary] = *reinterpret_cast<decltype(Data) *>(UserData);
133 bool IsValid = false;
134 OFFLOAD_ERR(olIsValidBinary(Device, Binary->getBufferStart(),
135 Binary->getBufferSize(), &IsValid));
136 if (!IsValid)
137 return true;
138
139 *Output = Device;
140 return false;
141 },
142 &Data));
143 return Device;
144}
145
146ol_device_handle_t getHostDevice() {
147 ol_device_handle_t Device;
148 OFFLOAD_ERR(olIterateDevices(
149 [](ol_device_handle_t Device, void *UserData) {
150 ol_platform_handle_t Platform;
151 olGetDeviceInfo(Device, OL_DEVICE_INFO_PLATFORM, sizeof(Platform),
152 &Platform);
153 ol_platform_backend_t Backend;
154 olGetPlatformInfo(Platform, OL_PLATFORM_INFO_BACKEND, sizeof(Backend),
155 &Backend);
156
157 auto &Output = *reinterpret_cast<decltype(Device) *>(UserData);
158 if (Backend == OL_PLATFORM_BACKEND_HOST) {
159 Output = Device;
160 return false;
161 }
162 return true;
163 },
164 &Device));
165 return Device;
166}
167
168template <typename... Args>
169void launchKernel(ol_queue_handle_t Queue, ol_device_handle_t Device,
170 ol_program_handle_t Program, const char *Name,
171 ol_kernel_launch_size_args_t LaunchArgs,
172 Args &...KernelArgs) {
173 ol_symbol_handle_t Kernel;
174 OFFLOAD_ERR(olGetSymbol(Program, Name, OL_SYMBOL_KIND_KERNEL, &Kernel));
175
176 if constexpr (sizeof...(Args) == 0) {
177 OFFLOAD_ERR(olLaunchKernel(Queue, Device, Kernel, &LaunchArgs, nullptr, 0,
178 nullptr, nullptr));
179 } else {
180 void *ArgPtrs[] = {static_cast<void *>(&KernelArgs)...};
181 size_t ArgSizes[] = {sizeof(KernelArgs)...};
182 OFFLOAD_ERR(olLaunchKernel(Queue, Device, Kernel, &LaunchArgs, nullptr,
183 sizeof...(Args), ArgPtrs, ArgSizes));
184 }
185}
186
187int main(int argc, const char **argv, const char **envp) {
188 sys::PrintStackTraceOnErrorSignal(Argv0: argv[0]);
189 cl::HideUnrelatedOptions(Category&: LoaderCategory);
190 cl::ParseCommandLineOptions(
191 argc, argv,
192 Overview: "A utility used to launch unit tests built for a GPU target. This is\n"
193 "intended to provide an interface similar to cross-compiling "
194 "emulators\n");
195
196 if (Help) {
197 cl::PrintHelpMessage();
198 return EXIT_SUCCESS;
199 }
200
201 if (Error Err = loadLLVMOffload())
202 handleError(E: std::move(Err));
203
204 ErrorOr<std::unique_ptr<MemoryBuffer>> ImageOrErr =
205 MemoryBuffer::getFileOrSTDIN(Filename: File);
206 if (std::error_code EC = ImageOrErr.getError())
207 handleError(E: errorCodeToError(EC));
208 MemoryBufferRef Image = **ImageOrErr;
209
210 ol_platform_backend_t Backend;
211 ol_init_args_t InitArgs = OL_INIT_ARGS_INIT;
212
213 file_magic Magic = identify_magic(magic: Image.getBuffer());
214 if (Magic >= file_magic::elf && Magic <= file_magic::elf_core) {
215 Expected<object::ELFFile<object::ELF64LE>> ElfOrErr =
216 object::ELFFile<object::ELF64LE>::create(Object: Image.getBuffer());
217 if (!ElfOrErr)
218 handleError(E: ElfOrErr.takeError());
219
220 switch (ElfOrErr->getHeader().e_machine) {
221 case ELF::EM_AMDGPU:
222 Backend = OL_PLATFORM_BACKEND_AMDGPU;
223 break;
224 case ELF::EM_CUDA:
225 Backend = OL_PLATFORM_BACKEND_CUDA;
226 break;
227 default:
228 handleError(E: createStringError(
229 Fmt: "unhandled ELF architecture: %s",
230 Vals: ELF::convertEMachineToArchName(EMachine: ElfOrErr->getHeader().e_machine)
231 .data()));
232 }
233 InitArgs.NumPlatforms = 1;
234 InitArgs.Platforms = &Backend;
235 }
236
237 SmallVector<const char *> NewArgv = {File.c_str()};
238 llvm::transform(Range&: Args, d_first: std::back_inserter(x&: NewArgv),
239 F: [](const std::string &Arg) { return Arg.c_str(); });
240
241 OFFLOAD_ERR(olInit(&InitArgs));
242 ol_device_handle_t Device = findDevice(Binary: Image);
243 if (!Device)
244 handleError(E: createStringError(Fmt: "No compatible device was found"));
245 ol_device_handle_t Host = getHostDevice();
246 assert(Host && "Host device should always be present");
247
248 ol_program_handle_t Program;
249 OFFLOAD_ERR(olCreateProgram(Device, Image.getBufferStart(),
250 Image.getBufferSize(), &Program));
251
252 ol_queue_handle_t Queue;
253 OFFLOAD_ERR(olCreateQueue(Device, &Queue));
254
255 int DevArgc = static_cast<int>(NewArgv.size());
256 void *DevArgv = copyArgumentVector(Argc: NewArgv.size(), Argv: NewArgv.begin(), Device);
257 void *DevEnvp = copyEnvironment(Envp: envp, Device);
258
259 void *DevRet;
260 int Zero = 0;
261 OFFLOAD_ERR(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, sizeof(int), &DevRet));
262 OFFLOAD_ERR(olMemcpy(Queue, DevRet, Device, &Zero, Host, sizeof(int)));
263
264 ol_kernel_launch_size_args_t BeginLaunch{.Dimensions: 1, .NumGroups: {.x: 1, .y: 1, .z: 1}, .GroupSize: {.x: 1, .y: 1, .z: 1}, .DynSharedMemory: 0};
265 launchKernel(Queue, Device, Program, Name: "_begin", LaunchArgs: BeginLaunch, KernelArgs&: DevArgc, KernelArgs&: DevArgv,
266 KernelArgs&: DevEnvp);
267 OFFLOAD_ERR(olSyncQueue(Queue));
268
269 uint32_t Dims = (BlocksZ > 1) ? 3 : (BlocksY > 1) ? 2 : 1;
270 ol_kernel_launch_size_args_t StartLaunch{.Dimensions: Dims,
271 .NumGroups: {.x: BlocksX, .y: BlocksY, .z: BlocksZ},
272 .GroupSize: {.x: ThreadsX, .y: ThreadsY, .z: ThreadsZ},
273 /*SharedMemBytes=*/.DynSharedMemory: 0};
274 launchKernel(Queue, Device, Program, Name: "_start", LaunchArgs: StartLaunch, KernelArgs&: DevArgc, KernelArgs&: DevArgv,
275 KernelArgs&: DevEnvp, KernelArgs&: DevRet);
276
277 ol_kernel_launch_size_args_t EndLaunch{.Dimensions: 1, .NumGroups: {.x: 1, .y: 1, .z: 1}, .GroupSize: {.x: 1, .y: 1, .z: 1}, .DynSharedMemory: 0};
278 launchKernel(Queue, Device, Program, Name: "_end", LaunchArgs: EndLaunch);
279
280 int Ret;
281 OFFLOAD_ERR(olMemcpy(Queue, &Ret, Host, DevRet, Device, sizeof(int)));
282 OFFLOAD_ERR(olSyncQueue(Queue));
283
284 OFFLOAD_ERR(olMemFree(DevRet));
285 OFFLOAD_ERR(olMemFree(DevArgv));
286 OFFLOAD_ERR(olMemFree(DevEnvp));
287 OFFLOAD_ERR(olDestroyQueue(Queue));
288 OFFLOAD_ERR(olDestroyProgram(Program));
289 OFFLOAD_ERR(olShutDown());
290
291 return Ret;
292}
293