| 1 | //===-- Main entry into the loader interface ------------------------------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // This utility is used to launch standard programs onto the GPU in conjunction |
| 10 | // with the LLVM 'libc' project. It is designed to mimic a standard emulator |
| 11 | // workflow, allowing for unit tests to be run on the GPU directly. |
| 12 | // |
| 13 | //===----------------------------------------------------------------------===// |
| 14 | |
| 15 | #include "llvm-gpu-loader.h" |
| 16 | |
| 17 | #include "llvm/BinaryFormat/Magic.h" |
| 18 | #include "llvm/Object/ELF.h" |
| 19 | #include "llvm/Object/ELFObjectFile.h" |
| 20 | #include "llvm/Support/CommandLine.h" |
| 21 | #include "llvm/Support/Error.h" |
| 22 | #include "llvm/Support/FileSystem.h" |
| 23 | #include "llvm/Support/MemoryBuffer.h" |
| 24 | #include "llvm/Support/Path.h" |
| 25 | #include "llvm/Support/Signals.h" |
| 26 | #include "llvm/Support/WithColor.h" |
| 27 | #include "llvm/TargetParser/Triple.h" |
| 28 | |
| 29 | #include <cerrno> |
| 30 | #include <cstdio> |
| 31 | #include <cstdlib> |
| 32 | #include <cstring> |
| 33 | #include <string> |
| 34 | |
| 35 | using namespace llvm; |
| 36 | |
| 37 | static cl::OptionCategory LoaderCategory("loader options" ); |
| 38 | |
| 39 | static cl::opt<bool> Help("h" , cl::desc("Alias for -help" ), cl::Hidden, |
| 40 | cl::cat(LoaderCategory)); |
| 41 | |
| 42 | static cl::opt<unsigned> |
| 43 | ThreadsX("threads-x" , cl::desc("Number of threads in the 'x' dimension" ), |
| 44 | cl::init(Val: 1), cl::cat(LoaderCategory)); |
| 45 | static cl::opt<unsigned> |
| 46 | ThreadsY("threads-y" , cl::desc("Number of threads in the 'y' dimension" ), |
| 47 | cl::init(Val: 1), cl::cat(LoaderCategory)); |
| 48 | static cl::opt<unsigned> |
| 49 | ThreadsZ("threads-z" , cl::desc("Number of threads in the 'z' dimension" ), |
| 50 | cl::init(Val: 1), cl::cat(LoaderCategory)); |
| 51 | static cl::alias threads("threads" , cl::aliasopt(ThreadsX), |
| 52 | cl::desc("Alias for --threads-x" ), |
| 53 | cl::cat(LoaderCategory)); |
| 54 | |
| 55 | static cl::opt<unsigned> |
| 56 | BlocksX("blocks-x" , cl::desc("Number of blocks in the 'x' dimension" ), |
| 57 | cl::init(Val: 1), cl::cat(LoaderCategory)); |
| 58 | static cl::opt<unsigned> |
| 59 | BlocksY("blocks-y" , cl::desc("Number of blocks in the 'y' dimension" ), |
| 60 | cl::init(Val: 1), cl::cat(LoaderCategory)); |
| 61 | static cl::opt<unsigned> |
| 62 | BlocksZ("blocks-z" , cl::desc("Number of blocks in the 'z' dimension" ), |
| 63 | cl::init(Val: 1), cl::cat(LoaderCategory)); |
| 64 | static cl::alias Blocks("blocks" , cl::aliasopt(BlocksX), |
| 65 | cl::desc("Alias for --blocks-x" ), |
| 66 | cl::cat(LoaderCategory)); |
| 67 | |
| 68 | static cl::opt<std::string> File(cl::Positional, cl::Required, |
| 69 | cl::desc("<gpu executable>" ), |
| 70 | cl::cat(LoaderCategory)); |
| 71 | static cl::list<std::string> Args(cl::ConsumeAfter, |
| 72 | cl::desc("<program arguments>..." ), |
| 73 | cl::cat(LoaderCategory)); |
| 74 | |
// Argument block handed to the '_begin' kernel. main() fills it with the
// argument count, the copied argument vector and the copied environment.
struct BeginArgs {
  int Argc;   // Number of entries in Argv.
  void *Argv; // Null-terminated argument vector from copyArgumentVector().
  void *Envp; // Null-terminated environment vector from copyEnvironment().
};
| 81 | |
// Argument block handed to the '_start' kernel. It carries the same three
// values as the '_begin' arguments plus a pointer to the result slot.
struct StartArgs {
  int Argc;   // Number of entries in Argv.
  void *Argv; // Null-terminated argument vector from copyArgumentVector().
  void *Envp; // Null-terminated environment vector from copyEnvironment().
  void *Ret;  // Device allocation of sizeof(int); main() copies an int back
              // from it and uses that as the process exit code.
};
| 89 | |
// Argument block handed to the '_end' kernel. It is intentionally empty;
// launchKernel() reports an argument size of zero for empty types.
struct EndArgs {};
| 92 | |
| 93 | [[noreturn]] static void handleError(Error E) { |
| 94 | outs().flush(); |
| 95 | logAllUnhandledErrors(E: std::move(E), OS&: WithColor::error(OS&: errs(), Prefix: "loader" )); |
| 96 | exit(EXIT_FAILURE); |
| 97 | } |
| 98 | |
| 99 | [[noreturn]] static void handleError(ol_result_t Err, unsigned Line) { |
| 100 | fprintf(stderr, format: "%s:%d %s\n" , __FILE__, Line, Err->Details); |
| 101 | exit(EXIT_FAILURE); |
| 102 | } |
| 103 | |
// Evaluates the offload runtime call X once and, if it yields a non-null
// result, reports it through handleError() together with the current line.
//
// The body is wrapped in do { ... } while (0) so that the macro expands to a
// single statement: it requires a trailing semicolon and can be used safely as
// the body of an unbraced if/else. X is parenthesized so that any expression
// can be passed.
#define OFFLOAD_ERR(X)                                                         \
  do {                                                                         \
    if (ol_result_t Err = (X))                                                 \
      handleError(Err, __LINE__);                                              \
  } while (0)
| 107 | |
| 108 | static void *copyArgumentVector(int Argc, const char **Argv, |
| 109 | ol_device_handle_t Device) { |
| 110 | size_t ArgSize = sizeof(char *) * (Argc + 1); |
| 111 | size_t StringLen = 0; |
| 112 | for (int i = 0; i < Argc; ++i) |
| 113 | StringLen += strlen(s: Argv[i]) + 1; |
| 114 | |
| 115 | // We allocate enough space for a null terminated array and all the strings. |
| 116 | void *DevArgv; |
| 117 | OFFLOAD_ERR( |
| 118 | olMemAlloc(Device, OL_ALLOC_TYPE_HOST, ArgSize + StringLen, &DevArgv)); |
| 119 | if (!DevArgv) |
| 120 | handleError( |
| 121 | E: createStringError(Fmt: "Failed to allocate memory for environment." )); |
| 122 | |
| 123 | // Store the strings linerally in the same memory buffer. |
| 124 | void *DevString = reinterpret_cast<uint8_t *>(DevArgv) + ArgSize; |
| 125 | for (int i = 0; i < Argc; ++i) { |
| 126 | size_t size = strlen(s: Argv[i]) + 1; |
| 127 | std::memcpy(dest: DevString, src: Argv[i], n: size); |
| 128 | static_cast<void **>(DevArgv)[i] = DevString; |
| 129 | DevString = reinterpret_cast<uint8_t *>(DevString) + size; |
| 130 | } |
| 131 | |
| 132 | // Ensure the vector is null terminated. |
| 133 | reinterpret_cast<void **>(DevArgv)[Argc] = nullptr; |
| 134 | return DevArgv; |
| 135 | } |
| 136 | |
| 137 | void *copyEnvironment(const char **Envp, ol_device_handle_t Device) { |
| 138 | int Envc = 0; |
| 139 | for (const char **Env = Envp; *Env != 0; ++Env) |
| 140 | ++Envc; |
| 141 | |
| 142 | return copyArgumentVector(Argc: Envc, Argv: Envp, Device); |
| 143 | } |
| 144 | |
| 145 | ol_device_handle_t findDevice(MemoryBufferRef Binary) { |
| 146 | ol_device_handle_t Device; |
| 147 | std::tuple Data = std::make_tuple(args: &Device, args: &Binary); |
| 148 | OFFLOAD_ERR(olIterateDevices( |
| 149 | [](ol_device_handle_t Device, void *UserData) { |
| 150 | auto &[Output, Binary] = *reinterpret_cast<decltype(Data) *>(UserData); |
| 151 | bool IsValid = false; |
| 152 | OFFLOAD_ERR(olIsValidBinary(Device, Binary->getBufferStart(), |
| 153 | Binary->getBufferSize(), &IsValid)); |
| 154 | if (!IsValid) |
| 155 | return true; |
| 156 | |
| 157 | *Output = Device; |
| 158 | return false; |
| 159 | }, |
| 160 | &Data)); |
| 161 | return Device; |
| 162 | } |
| 163 | |
| 164 | ol_device_handle_t getHostDevice() { |
| 165 | ol_device_handle_t Device; |
| 166 | OFFLOAD_ERR(olIterateDevices( |
| 167 | [](ol_device_handle_t Device, void *UserData) { |
| 168 | ol_platform_handle_t Platform; |
| 169 | olGetDeviceInfo(Device, OL_DEVICE_INFO_PLATFORM, sizeof(Platform), |
| 170 | &Platform); |
| 171 | ol_platform_backend_t Backend; |
| 172 | olGetPlatformInfo(Platform, OL_PLATFORM_INFO_BACKEND, sizeof(Backend), |
| 173 | &Backend); |
| 174 | |
| 175 | auto &Output = *reinterpret_cast<decltype(Device) *>(UserData); |
| 176 | if (Backend == OL_PLATFORM_BACKEND_HOST) { |
| 177 | Output = Device; |
| 178 | return false; |
| 179 | } |
| 180 | return true; |
| 181 | }, |
| 182 | &Device)); |
| 183 | return Device; |
| 184 | } |
| 185 | |
| 186 | template <typename Args> |
| 187 | void launchKernel(ol_queue_handle_t Queue, ol_device_handle_t Device, |
| 188 | ol_program_handle_t Program, const char *Name, |
| 189 | ol_kernel_launch_size_args_t LaunchArgs, Args &KernelArgs) { |
| 190 | ol_symbol_handle_t Kernel; |
| 191 | OFFLOAD_ERR(olGetSymbol(Program, Name, OL_SYMBOL_KIND_KERNEL, &Kernel)); |
| 192 | |
| 193 | OFFLOAD_ERR(olLaunchKernel(Queue, Device, Kernel, &KernelArgs, |
| 194 | std::is_empty_v<Args> ? 0 : sizeof(Args), |
| 195 | &LaunchArgs)); |
| 196 | } |
| 197 | |
| 198 | int main(int argc, const char **argv, const char **envp) { |
| 199 | sys::PrintStackTraceOnErrorSignal(Argv0: argv[0]); |
| 200 | cl::HideUnrelatedOptions(Category&: LoaderCategory); |
| 201 | cl::ParseCommandLineOptions( |
| 202 | argc, argv, |
| 203 | Overview: "A utility used to launch unit tests built for a GPU target. This is\n" |
| 204 | "intended to provide an intrface simular to cross-compiling emulators\n" ); |
| 205 | |
| 206 | if (Help) { |
| 207 | cl::PrintHelpMessage(); |
| 208 | return EXIT_SUCCESS; |
| 209 | } |
| 210 | |
| 211 | if (Error Err = loadLLVMOffload()) |
| 212 | handleError(E: std::move(Err)); |
| 213 | |
| 214 | ErrorOr<std::unique_ptr<MemoryBuffer>> ImageOrErr = |
| 215 | MemoryBuffer::getFileOrSTDIN(Filename: File); |
| 216 | if (std::error_code EC = ImageOrErr.getError()) |
| 217 | handleError(E: errorCodeToError(EC)); |
| 218 | MemoryBufferRef Image = **ImageOrErr; |
| 219 | |
| 220 | ol_platform_backend_t Backend; |
| 221 | ol_init_args_t InitArgs = OL_INIT_ARGS_INIT; |
| 222 | |
| 223 | file_magic Magic = identify_magic(magic: Image.getBuffer()); |
| 224 | if (Magic >= file_magic::elf && Magic <= file_magic::elf_core) { |
| 225 | Expected<object::ELFFile<object::ELF64LE>> ElfOrErr = |
| 226 | object::ELFFile<object::ELF64LE>::create(Object: Image.getBuffer()); |
| 227 | if (!ElfOrErr) |
| 228 | handleError(E: ElfOrErr.takeError()); |
| 229 | |
| 230 | switch (ElfOrErr->getHeader().e_machine) { |
| 231 | case ELF::EM_AMDGPU: |
| 232 | Backend = OL_PLATFORM_BACKEND_AMDGPU; |
| 233 | break; |
| 234 | case ELF::EM_CUDA: |
| 235 | Backend = OL_PLATFORM_BACKEND_CUDA; |
| 236 | break; |
| 237 | default: |
| 238 | handleError(E: createStringError( |
| 239 | Fmt: "unhandled ELF architecture: %s" , |
| 240 | Vals: ELF::convertEMachineToArchName(EMachine: ElfOrErr->getHeader().e_machine) |
| 241 | .data())); |
| 242 | } |
| 243 | InitArgs.NumPlatforms = 1; |
| 244 | InitArgs.Platforms = &Backend; |
| 245 | } |
| 246 | |
| 247 | SmallVector<const char *> NewArgv = {File.c_str()}; |
| 248 | llvm::transform(Range&: Args, d_first: std::back_inserter(x&: NewArgv), |
| 249 | F: [](const std::string &Arg) { return Arg.c_str(); }); |
| 250 | |
| 251 | OFFLOAD_ERR(olInit(&InitArgs)); |
| 252 | ol_device_handle_t Device = findDevice(Binary: Image); |
| 253 | ol_device_handle_t Host = getHostDevice(); |
| 254 | |
| 255 | ol_program_handle_t Program; |
| 256 | OFFLOAD_ERR(olCreateProgram(Device, Image.getBufferStart(), |
| 257 | Image.getBufferSize(), &Program)); |
| 258 | |
| 259 | ol_queue_handle_t Queue; |
| 260 | OFFLOAD_ERR(olCreateQueue(Device, &Queue)); |
| 261 | |
| 262 | int DevArgc = static_cast<int>(NewArgv.size()); |
| 263 | void *DevArgv = copyArgumentVector(Argc: NewArgv.size(), Argv: NewArgv.begin(), Device); |
| 264 | void *DevEnvp = copyEnvironment(Envp: envp, Device); |
| 265 | |
| 266 | void *DevRet; |
| 267 | OFFLOAD_ERR(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, sizeof(int), &DevRet)); |
| 268 | |
| 269 | ol_kernel_launch_size_args_t BeginLaunch{.Dimensions: 1, .NumGroups: {.x: 1, .y: 1, .z: 1}, .GroupSize: {.x: 1, .y: 1, .z: 1}, .DynSharedMemory: 0}; |
| 270 | BeginArgs BeginArgs = {.Argc: DevArgc, .Argv: DevArgv, .Envp: DevEnvp}; |
| 271 | launchKernel(Queue, Device, Program, Name: "_begin" , LaunchArgs: BeginLaunch, KernelArgs&: BeginArgs); |
| 272 | OFFLOAD_ERR(olSyncQueue(Queue)); |
| 273 | |
| 274 | uint32_t Dims = (BlocksZ > 1) ? 3 : (BlocksY > 1) ? 2 : 1; |
| 275 | ol_kernel_launch_size_args_t StartLaunch{.Dimensions: Dims, |
| 276 | .NumGroups: {.x: BlocksX, .y: BlocksY, .z: BlocksZ}, |
| 277 | .GroupSize: {.x: ThreadsX, .y: ThreadsY, .z: ThreadsZ}, |
| 278 | /*SharedMemBytes=*/.DynSharedMemory: 0}; |
| 279 | StartArgs StartArgs = {.Argc: DevArgc, .Argv: DevArgv, .Envp: DevEnvp, .Ret: DevRet}; |
| 280 | launchKernel(Queue, Device, Program, Name: "_start" , LaunchArgs: StartLaunch, KernelArgs&: StartArgs); |
| 281 | |
| 282 | ol_kernel_launch_size_args_t EndLaunch{.Dimensions: 1, .NumGroups: {.x: 1, .y: 1, .z: 1}, .GroupSize: {.x: 1, .y: 1, .z: 1}, .DynSharedMemory: 0}; |
| 283 | EndArgs EndArgs = {}; |
| 284 | launchKernel(Queue, Device, Program, Name: "_end" , LaunchArgs: EndLaunch, KernelArgs&: EndArgs); |
| 285 | |
| 286 | int Ret; |
| 287 | OFFLOAD_ERR(olMemcpy(Queue, &Ret, Host, DevRet, Device, sizeof(int))); |
| 288 | OFFLOAD_ERR(olSyncQueue(Queue)); |
| 289 | |
| 290 | OFFLOAD_ERR(olMemFree(DevArgv)); |
| 291 | OFFLOAD_ERR(olMemFree(DevEnvp)); |
| 292 | OFFLOAD_ERR(olDestroyQueue(Queue)); |
| 293 | OFFLOAD_ERR(olDestroyProgram(Program)); |
| 294 | OFFLOAD_ERR(olShutDown()); |
| 295 | |
| 296 | return Ret; |
| 297 | } |
| 298 | |