//===-- Main entry into the loader interface ------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This utility is used to launch standard programs onto the GPU in conjunction // with the LLVM 'libc' project. It is designed to mimic a standard emulator // workflow, allowing for unit tests to be run on the GPU directly. // //===----------------------------------------------------------------------===// #include "llvm-gpu-loader.h" #include "llvm/BinaryFormat/Magic.h" #include "llvm/Object/ELF.h" #include "llvm/Object/ELFObjectFile.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Error.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" #include "llvm/Support/Signals.h" #include "llvm/Support/WithColor.h" #include "llvm/TargetParser/Triple.h" #include #include #include #include #include using namespace llvm; static cl::OptionCategory LoaderCategory("loader options"); static cl::opt Help("h", cl::desc("Alias for -help"), cl::Hidden, cl::cat(LoaderCategory)); static cl::opt ThreadsX("threads-x", cl::desc("Number of threads in the 'x' dimension"), cl::init(1), cl::cat(LoaderCategory)); static cl::opt ThreadsY("threads-y", cl::desc("Number of threads in the 'y' dimension"), cl::init(1), cl::cat(LoaderCategory)); static cl::opt ThreadsZ("threads-z", cl::desc("Number of threads in the 'z' dimension"), cl::init(1), cl::cat(LoaderCategory)); static cl::alias threads("threads", cl::aliasopt(ThreadsX), cl::desc("Alias for --threads-x"), cl::cat(LoaderCategory)); static cl::opt BlocksX("blocks-x", cl::desc("Number of blocks in the 'x' dimension"), cl::init(1), cl::cat(LoaderCategory)); static cl::opt BlocksY("blocks-y", cl::desc("Number of blocks in the 'y' dimension"), cl::init(1), cl::cat(LoaderCategory)); static cl::opt BlocksZ("blocks-z", cl::desc("Number of blocks in the 'z' dimension"), cl::init(1), cl::cat(LoaderCategory)); static cl::alias Blocks("blocks", cl::aliasopt(BlocksX), cl::desc("Alias for --blocks-x"), cl::cat(LoaderCategory)); static cl::opt File(cl::Positional, cl::Required, cl::desc(""), cl::cat(LoaderCategory)); static cl::list Args(cl::ConsumeAfter, cl::desc("..."), cl::cat(LoaderCategory)); // The arguments to the '_begin' kernel. struct BeginArgs { int Argc; void *Argv; void *Envp; }; // The arguments to the '_start' kernel. struct StartArgs { int Argc; void *Argv; void *Envp; void *Ret; }; // The arguments to the '_end' kernel. struct EndArgs {}; [[noreturn]] static void handleError(Error E) { outs().flush(); logAllUnhandledErrors(std::move(E), WithColor::error(errs(), "loader")); exit(EXIT_FAILURE); } [[noreturn]] static void handleError(ol_result_t Err, unsigned Line) { fprintf(stderr, "%s:%d %s\n", __FILE__, Line, Err->Details); exit(EXIT_FAILURE); } #define OFFLOAD_ERR(X) \ if (ol_result_t Err = X) \ handleError(Err, __LINE__); static void *copyArgumentVector(int Argc, const char **Argv, ol_device_handle_t Device) { size_t ArgSize = sizeof(char *) * (Argc + 1); size_t StringLen = 0; for (int i = 0; i < Argc; ++i) StringLen += strlen(Argv[i]) + 1; // We allocate enough space for a null terminated array and all the strings. void *DevArgv; OFFLOAD_ERR( olMemAlloc(Device, OL_ALLOC_TYPE_HOST, ArgSize + StringLen, &DevArgv)); if (!DevArgv) handleError( createStringError("Failed to allocate memory for environment.")); // Store the strings linerally in the same memory buffer. void *DevString = reinterpret_cast(DevArgv) + ArgSize; for (int i = 0; i < Argc; ++i) { size_t size = strlen(Argv[i]) + 1; std::memcpy(DevString, Argv[i], size); static_cast(DevArgv)[i] = DevString; DevString = reinterpret_cast(DevString) + size; } // Ensure the vector is null terminated. reinterpret_cast(DevArgv)[Argc] = nullptr; return DevArgv; } void *copyEnvironment(const char **Envp, ol_device_handle_t Device) { int Envc = 0; for (const char **Env = Envp; *Env != 0; ++Env) ++Envc; return copyArgumentVector(Envc, Envp, Device); } ol_device_handle_t findDevice(MemoryBufferRef Binary) { ol_device_handle_t Device = nullptr; std::tuple Data = std::make_tuple(&Device, &Binary); OFFLOAD_ERR(olIterateDevices( [](ol_device_handle_t Device, void *UserData) { auto &[Output, Binary] = *reinterpret_cast(UserData); bool IsValid = false; OFFLOAD_ERR(olIsValidBinary(Device, Binary->getBufferStart(), Binary->getBufferSize(), &IsValid)); if (!IsValid) return true; *Output = Device; return false; }, &Data)); return Device; } ol_device_handle_t getHostDevice() { ol_device_handle_t Device; OFFLOAD_ERR(olIterateDevices( [](ol_device_handle_t Device, void *UserData) { ol_platform_handle_t Platform; olGetDeviceInfo(Device, OL_DEVICE_INFO_PLATFORM, sizeof(Platform), &Platform); ol_platform_backend_t Backend; olGetPlatformInfo(Platform, OL_PLATFORM_INFO_BACKEND, sizeof(Backend), &Backend); auto &Output = *reinterpret_cast(UserData); if (Backend == OL_PLATFORM_BACKEND_HOST) { Output = Device; return false; } return true; }, &Device)); return Device; } template void launchKernel(ol_queue_handle_t Queue, ol_device_handle_t Device, ol_program_handle_t Program, const char *Name, ol_kernel_launch_size_args_t LaunchArgs, Args &KernelArgs) { ol_symbol_handle_t Kernel; OFFLOAD_ERR(olGetSymbol(Program, Name, OL_SYMBOL_KIND_KERNEL, &Kernel)); OFFLOAD_ERR(olLaunchKernel(Queue, Device, Kernel, &KernelArgs, std::is_empty_v ? 0 : sizeof(Args), &LaunchArgs)); } int main(int argc, const char **argv, const char **envp) { sys::PrintStackTraceOnErrorSignal(argv[0]); cl::HideUnrelatedOptions(LoaderCategory); cl::ParseCommandLineOptions( argc, argv, "A utility used to launch unit tests built for a GPU target. This is\n" "intended to provide an interface similar to cross-compiling " "emulators\n"); if (Help) { cl::PrintHelpMessage(); return EXIT_SUCCESS; } if (Error Err = loadLLVMOffload()) handleError(std::move(Err)); ErrorOr> ImageOrErr = MemoryBuffer::getFileOrSTDIN(File); if (std::error_code EC = ImageOrErr.getError()) handleError(errorCodeToError(EC)); MemoryBufferRef Image = **ImageOrErr; ol_platform_backend_t Backend; ol_init_args_t InitArgs = OL_INIT_ARGS_INIT; file_magic Magic = identify_magic(Image.getBuffer()); if (Magic >= file_magic::elf && Magic <= file_magic::elf_core) { Expected> ElfOrErr = object::ELFFile::create(Image.getBuffer()); if (!ElfOrErr) handleError(ElfOrErr.takeError()); switch (ElfOrErr->getHeader().e_machine) { case ELF::EM_AMDGPU: Backend = OL_PLATFORM_BACKEND_AMDGPU; break; case ELF::EM_CUDA: Backend = OL_PLATFORM_BACKEND_CUDA; break; default: handleError(createStringError( "unhandled ELF architecture: %s", ELF::convertEMachineToArchName(ElfOrErr->getHeader().e_machine) .data())); } InitArgs.NumPlatforms = 1; InitArgs.Platforms = &Backend; } SmallVector NewArgv = {File.c_str()}; llvm::transform(Args, std::back_inserter(NewArgv), [](const std::string &Arg) { return Arg.c_str(); }); OFFLOAD_ERR(olInit(&InitArgs)); ol_device_handle_t Device = findDevice(Image); if (!Device) handleError(createStringError("No compatible device was found")); ol_device_handle_t Host = getHostDevice(); assert(Host && "Host device should always be present"); ol_program_handle_t Program; OFFLOAD_ERR(olCreateProgram(Device, Image.getBufferStart(), Image.getBufferSize(), &Program)); ol_queue_handle_t Queue; OFFLOAD_ERR(olCreateQueue(Device, &Queue)); int DevArgc = static_cast(NewArgv.size()); void *DevArgv = copyArgumentVector(NewArgv.size(), NewArgv.begin(), Device); void *DevEnvp = copyEnvironment(envp, Device); void *DevRet; int Zero = 0; OFFLOAD_ERR(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, sizeof(int), &DevRet)); OFFLOAD_ERR(olMemcpy(Queue, DevRet, Device, &Zero, Host, sizeof(int))); ol_kernel_launch_size_args_t BeginLaunch{1, {1, 1, 1}, {1, 1, 1}, 0}; BeginArgs BeginArgs = {DevArgc, DevArgv, DevEnvp}; launchKernel(Queue, Device, Program, "_begin", BeginLaunch, BeginArgs); OFFLOAD_ERR(olSyncQueue(Queue)); uint32_t Dims = (BlocksZ > 1) ? 3 : (BlocksY > 1) ? 2 : 1; ol_kernel_launch_size_args_t StartLaunch{Dims, {BlocksX, BlocksY, BlocksZ}, {ThreadsX, ThreadsY, ThreadsZ}, /*SharedMemBytes=*/0}; StartArgs StartArgs = {DevArgc, DevArgv, DevEnvp, DevRet}; launchKernel(Queue, Device, Program, "_start", StartLaunch, StartArgs); ol_kernel_launch_size_args_t EndLaunch{1, {1, 1, 1}, {1, 1, 1}, 0}; EndArgs EndArgs = {}; launchKernel(Queue, Device, Program, "_end", EndLaunch, EndArgs); int Ret; OFFLOAD_ERR(olMemcpy(Queue, &Ret, Host, DevRet, Device, sizeof(int))); OFFLOAD_ERR(olSyncQueue(Queue)); OFFLOAD_ERR(olMemFree(DevRet)); OFFLOAD_ERR(olMemFree(DevArgv)); OFFLOAD_ERR(olMemFree(DevEnvp)); OFFLOAD_ERR(olDestroyQueue(Queue)); OFFLOAD_ERR(olDestroyProgram(Program)); OFFLOAD_ERR(olShutDown()); return Ret; }