Add a new nextgen plugin that supports GPU devices through the Intel oneAPI Level Zero library. The plugin is not enabled by default and needs to be added to LIBOMPTARGET_PLUGINS_TO_BUILD explicitly. --------- Co-authored-by: Alexey Sachkov <alexey.sachkov@intel.com> Co-authored-by: Nick Sarnie <nick.sarnie@intel.com> Co-authored-by: Joseph Huber <huberjn@outlook.com>
167 lines
6.0 KiB
C++
167 lines
6.0 KiB
C++
//===--- Level Zero Target RTL Implementation -----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Level Zero RTL Options support.
//
//===----------------------------------------------------------------------===//
|
|
|
|
#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0OPTIONS_H
|
|
#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0OPTIONS_H
|
|
|
|
#include <level_zero/ze_api.h>

#include <new>

#include "Shared/EnvironmentVar.h"

#include "L0Defs.h"
|
|
|
|
namespace llvm::omp::target::plugin {
|
|
/// Command submission mode.
///
/// The enumerators are numbered consecutively starting at zero.
enum class CommandModeTy {
  Sync = 0,    ///< Numeric value 0.
  Async,       ///< Numeric value 1.
  AsyncOrdered ///< Numeric value 2.
};
|
|
|
|
/// Specialization constants used for a module compilation.
|
|
class SpecConstantsTy {
|
|
std::vector<uint32_t> ConstantIds;
|
|
std::vector<const void *> ConstantValues;
|
|
BumpPtrAllocator &Allocator;
|
|
|
|
public:
|
|
SpecConstantsTy(BumpPtrAllocator &Allocator) : Allocator(Allocator) {}
|
|
SpecConstantsTy(const SpecConstantsTy &) = delete;
|
|
SpecConstantsTy(SpecConstantsTy &&) = delete;
|
|
SpecConstantsTy &operator=(const SpecConstantsTy &) = delete;
|
|
SpecConstantsTy &operator=(const SpecConstantsTy &&) = delete;
|
|
SpecConstantsTy(const SpecConstantsTy &&Other)
|
|
: ConstantIds(std::move(Other.ConstantIds)),
|
|
ConstantValues(std::move(Other.ConstantValues)),
|
|
Allocator(Other.Allocator) {}
|
|
~SpecConstantsTy() = default;
|
|
|
|
template <typename T> void addConstant(uint32_t Id, T Val) {
|
|
T *ValuePtr =
|
|
reinterpret_cast<T *>(Allocator.Allocate(sizeof(T), alignof(T)));
|
|
*ValuePtr = Val;
|
|
|
|
ConstantIds.push_back(Id);
|
|
ConstantValues.push_back(reinterpret_cast<void *>(ValuePtr));
|
|
}
|
|
|
|
ze_module_constants_t getModuleConstants() const {
|
|
ze_module_constants_t Tmp{static_cast<uint32_t>(ConstantValues.size()),
|
|
ConstantIds.data(),
|
|
// Unfortunately we have to const_cast it.
|
|
// L0 data type should probably be fixed.
|
|
const_cast<const void **>(ConstantValues.data())};
|
|
return Tmp;
|
|
}
|
|
};
|
|
|
|
/// L0 Plugin flags.
///
/// Binary options packed into the bit-fields of a 64-bit unsigned integer.
struct L0OptionFlagsTy {
  /// Start with UseMemoryPool set and every reserved bit cleared.
  L0OptionFlagsTy()
      : UseMemoryPool(1), // Set by default.
        Reserved(0)       // All spare bits start out as zero.
  {}

  /// Single-bit flag; initialized to 1.
  uint64_t UseMemoryPool : 1;

  /// Remaining 63 bits, not assigned to any option; initialized to 0.
  uint64_t Reserved : 63;
};
|
|
|
|
struct L0OptionsTy {
|
|
/// Binary flags.
|
|
L0OptionFlagsTy Flags;
|
|
|
|
/// Staging buffer size.
|
|
size_t StagingBufferSize = L0StagingBufferSize;
|
|
|
|
/// Staging buffer count.
|
|
size_t StagingBufferCount = L0StagingBufferCount;
|
|
|
|
struct MemPoolConfigTy {
|
|
bool Use;
|
|
int32_t AllocMax;
|
|
int32_t Capacity;
|
|
int32_t PoolSize;
|
|
};
|
|
/// Memory pool default parameters for each allocation kind:
|
|
/// {UseByDefault, AllocMax(MB), Capacity, PoolSize(MB)}
|
|
std::array<MemPoolConfigTy, 3> MemPoolConfig{
|
|
MemPoolConfigTy{true, 1, 4, 256}, // TARGET_ALLOC_DEVICE
|
|
MemPoolConfigTy{true, 1, 4, 256}, // TARGET_ALLOC_HOST
|
|
MemPoolConfigTy{true, 8, 4, 256}}; // TARGET_ALLOC_SHARED
|
|
|
|
/// Parameters for memory pools dedicated to reduction scratch space.
|
|
std::array<int32_t, 3> ReductionPoolInfo{256, 8, 8192};
|
|
|
|
/// Oversubscription rate for normal kernels.
|
|
uint32_t SubscriptionRate = 4;
|
|
|
|
/// Loop kernels with known ND-range may be known to have
|
|
/// few iterations and they may not exploit the offload device
|
|
/// to the fullest extent.
|
|
/// Let's assume a device has N total HW threads available,
|
|
/// and the kernel requires M hardware threads with LWS set to L.
|
|
/// If (M < N * ThinThreadsThreshold), then we will try
|
|
/// to iteratively divide L by 2 to increase the number of HW
|
|
/// threads used for executing the kernel. Effectively, we will
|
|
/// end up with L less than the kernel's SIMD width, so the HW
|
|
/// threads will not use all their SIMD lanes. This (presumably) should
|
|
/// allow more parallelism, because the stalls in the SIMD lanes
|
|
/// will be distributed across more HW threads, and the probability
|
|
/// of having a stall (or a sequence of stalls) on a critical path
|
|
/// in the kernel should decrease.
|
|
/// Anyway, this is just a heuristics that seems to work well for some
|
|
/// kernels (which poorly expose parallelism in the first place).
|
|
double ThinThreadsThreshold = 0.1;
|
|
|
|
// Compilation options for IGC.
|
|
// OpenCL 2.0 builtins (like atomic_load_explicit and etc.) are used by
|
|
// runtime, so we have to explicitly specify the "-cl-std=CL2.0" compilation
|
|
// option. With it, the SPIR-V will be converted to LLVM IR with OpenCL 2.0
|
|
// builtins. Otherwise, SPIR-V will be converted to LLVM IR with OpenCL 1.2
|
|
// builtins.
|
|
static constexpr std::string_view CompilationOptions = "-cl-std=CL2.0 ";
|
|
static constexpr std::string_view InternalCompilationOptions =
|
|
"-cl-take-global-address";
|
|
std::string UserCompilationOptions;
|
|
|
|
/// Spec constants used for all modules.
|
|
SpecConstantsTy CommonSpecConstants;
|
|
|
|
/// Command execution mode.
|
|
/// Whether the runtime uses asynchronous mode or not depends on the type of
|
|
/// devices and whether immediate command list is fully enabled.
|
|
CommandModeTy CommandMode = CommandModeTy::Async;
|
|
|
|
/// Controls if we need to reduce available HW threads. We need this
|
|
/// adjustment on XeHPG when Level Zero debug is enabled
|
|
/// (ZET_ENABLE_PROGRAM_DEBUGGING=1).
|
|
bool ZeDebugEnabled = false;
|
|
|
|
bool Init = false; // Have the options already been processed.
|
|
|
|
// Allocator for long-lived allocations (e.g. spec constants).
|
|
BumpPtrAllocator Allocator;
|
|
|
|
L0OptionsTy() : CommonSpecConstants(Allocator) {}
|
|
|
|
/// Read environment variables.
|
|
void processEnvironmentVars();
|
|
|
|
void init() {
|
|
if (!Init) {
|
|
processEnvironmentVars();
|
|
Init = true;
|
|
}
|
|
}
|
|
|
|
bool match(const StringEnvar &Var, const llvm::StringRef Matched) {
|
|
return Matched.equals_insensitive(Var.get());
|
|
}
|
|
|
|
}; // L0OptionsTy
|
|
|
|
} // namespace llvm::omp::target::plugin
|
|
|
|
#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0OPTIONS_H
|