[OpenMP][Offload] Add offload runtime support for dyn_groupprivate clause (#152831)
Part 3 adding offload runtime support. See https://github.com/llvm/llvm-project/pull/152651. --------- Co-authored-by: Krzysztof Parzyszek <Krzysztof.Parzyszek@amd.com>
This commit is contained in:
committed by
GitHub
parent
11e0d6ae4b
commit
1f583c6dee
@@ -101,7 +101,7 @@ __OMP_STRUCT_TYPE(DynamicEnvironment, DynamicEnvironmentTy, false, Int16)
|
|||||||
__OMP_STRUCT_TYPE(KernelEnvironment, KernelEnvironmentTy, false,
|
__OMP_STRUCT_TYPE(KernelEnvironment, KernelEnvironmentTy, false,
|
||||||
ConfigurationEnvironment, IdentPtr, DynamicEnvironmentPtr)
|
ConfigurationEnvironment, IdentPtr, DynamicEnvironmentPtr)
|
||||||
__OMP_STRUCT_TYPE(KernelLaunchEnvironment, KernelLaunchEnvironmentTy, false,
|
__OMP_STRUCT_TYPE(KernelLaunchEnvironment, KernelLaunchEnvironmentTy, false,
|
||||||
Int32, Int32)
|
VoidPtr, VoidPtr, Int32, Int32, Int32, Int8)
|
||||||
|
|
||||||
#undef __OMP_STRUCT_TYPE
|
#undef __OMP_STRUCT_TYPE
|
||||||
#undef OMP_STRUCT_TYPE
|
#undef OMP_STRUCT_TYPE
|
||||||
|
|||||||
@@ -102,8 +102,9 @@ struct KernelArgsTy {
|
|||||||
struct {
|
struct {
|
||||||
uint64_t NoWait : 1; // Was this kernel spawned with a `nowait` clause.
|
uint64_t NoWait : 1; // Was this kernel spawned with a `nowait` clause.
|
||||||
uint64_t IsCUDA : 1; // Was this kernel spawned via CUDA.
|
uint64_t IsCUDA : 1; // Was this kernel spawned via CUDA.
|
||||||
uint64_t Unused : 62;
|
uint64_t DynCGroupMemFallback : 2; // The fallback for dynamic cgroup mem.
|
||||||
} Flags = {0, 0, 0};
|
uint64_t Unused : 60;
|
||||||
|
} Flags = {0, 0, 0, 0};
|
||||||
// The number of teams (for x,y,z dimension).
|
// The number of teams (for x,y,z dimension).
|
||||||
uint32_t NumTeams[3] = {0, 0, 0};
|
uint32_t NumTeams[3] = {0, 0, 0};
|
||||||
// The number of threads (for x,y,z dimension).
|
// The number of threads (for x,y,z dimension).
|
||||||
|
|||||||
@@ -70,10 +70,25 @@ struct KernelEnvironmentTy {
|
|||||||
DynamicEnvironmentTy *DynamicEnv = nullptr;
|
DynamicEnvironmentTy *DynamicEnv = nullptr;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/// The fallback types for the dynamic cgroup memory.
|
||||||
|
enum class DynCGroupMemFallbackType : uint8_t {
|
||||||
|
/// None. Used for indicating that no fallback was triggered.
|
||||||
|
None = 0,
|
||||||
|
/// Abort the execution.
|
||||||
|
Abort = None,
|
||||||
|
/// Return null pointer.
|
||||||
|
Null = 1,
|
||||||
|
/// Allocate from a implementation defined memory space.
|
||||||
|
DefaultMem = 2
|
||||||
|
};
|
||||||
|
|
||||||
struct KernelLaunchEnvironmentTy {
|
struct KernelLaunchEnvironmentTy {
|
||||||
|
void *ReductionBuffer = nullptr;
|
||||||
|
void *DynCGroupMemFbPtr = nullptr;
|
||||||
uint32_t ReductionCnt = 0;
|
uint32_t ReductionCnt = 0;
|
||||||
uint32_t ReductionIterCnt = 0;
|
uint32_t ReductionIterCnt = 0;
|
||||||
void *ReductionBuffer = nullptr;
|
uint32_t DynCGroupMemSize = 0;
|
||||||
|
DynCGroupMemFallbackType DynCGroupMemFb = DynCGroupMemFallbackType::None;
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif // OMPTARGET_SHARED_ENVIRONMENT_H
|
#endif // OMPTARGET_SHARED_ENVIRONMENT_H
|
||||||
|
|||||||
@@ -37,6 +37,8 @@
|
|||||||
#include "PluginInterface.h"
|
#include "PluginInterface.h"
|
||||||
|
|
||||||
using GenericPluginTy = llvm::omp::target::plugin::GenericPluginTy;
|
using GenericPluginTy = llvm::omp::target::plugin::GenericPluginTy;
|
||||||
|
using DeviceInfo = llvm::omp::target::plugin::DeviceInfo;
|
||||||
|
using InfoTreeNode = llvm::omp::target::plugin::InfoTreeNode;
|
||||||
|
|
||||||
// Forward declarations.
|
// Forward declarations.
|
||||||
struct __tgt_bin_desc;
|
struct __tgt_bin_desc;
|
||||||
@@ -167,6 +169,20 @@ struct DeviceTy {
|
|||||||
/// Indicate that there are pending images for this device or not.
|
/// Indicate that there are pending images for this device or not.
|
||||||
void setHasPendingImages(bool V) { HasPendingImages = V; }
|
void setHasPendingImages(bool V) { HasPendingImages = V; }
|
||||||
|
|
||||||
|
/// Get information from the device.
|
||||||
|
template <typename T> T getInfo(DeviceInfo Info) const {
|
||||||
|
InfoTreeNode DevInfo = RTL->obtain_device_info(RTLDeviceID);
|
||||||
|
|
||||||
|
auto EntryOpt = DevInfo.get(Info);
|
||||||
|
if (!EntryOpt)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
auto Entry = *EntryOpt;
|
||||||
|
if (!std::holds_alternative<T>(Entry->Value))
|
||||||
|
return T{};
|
||||||
|
return std::get<T>(Entry->Value);
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
/// Deinitialize the device (and plugin).
|
/// Deinitialize the device (and plugin).
|
||||||
void deinit();
|
void deinit();
|
||||||
|
|||||||
@@ -274,12 +274,23 @@ struct __tgt_target_non_contig {
|
|||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/// The OpenMP access group type. The criterion for grouping tasks using a
|
||||||
|
/// specific grouping property.
|
||||||
|
enum omp_access_t {
|
||||||
|
/// Groups the tasks based on the contention group to which they belong.
|
||||||
|
omp_access_cgroup = 0,
|
||||||
|
/// Groups the tasks based on the parallel region to which they bind.
|
||||||
|
omp_access_pteam = 1,
|
||||||
|
};
|
||||||
|
|
||||||
void ompx_dump_mapping_tables(void);
|
void ompx_dump_mapping_tables(void);
|
||||||
int omp_get_num_devices(void);
|
int omp_get_num_devices(void);
|
||||||
int omp_get_device_num(void);
|
int omp_get_device_num(void);
|
||||||
int omp_get_device_from_uid(const char *DeviceUid);
|
int omp_get_device_from_uid(const char *DeviceUid);
|
||||||
const char *omp_get_uid_from_device(int DeviceNum);
|
const char *omp_get_uid_from_device(int DeviceNum);
|
||||||
int omp_get_initial_device(void);
|
int omp_get_initial_device(void);
|
||||||
|
size_t omp_get_gprivate_limit(int DeviceNum,
|
||||||
|
omp_access_t AccessGroup = omp_access_cgroup);
|
||||||
void *omp_target_alloc(size_t Size, int DeviceNum);
|
void *omp_target_alloc(size_t Size, int DeviceNum);
|
||||||
void omp_target_free(void *DevicePtr, int DeviceNum);
|
void omp_target_free(void *DevicePtr, int DeviceNum);
|
||||||
int omp_target_is_present(const void *Ptr, int DeviceNum);
|
int omp_target_is_present(const void *Ptr, int DeviceNum);
|
||||||
|
|||||||
@@ -138,6 +138,22 @@ EXTERN int omp_get_initial_device(void) {
|
|||||||
return HostDevice;
|
return HostDevice;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
EXTERN size_t omp_get_gprivate_limit(int DeviceNum, omp_access_t AccessGroup) {
|
||||||
|
TIMESCOPE();
|
||||||
|
OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
|
||||||
|
if (DeviceNum == omp_get_initial_device())
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
if (AccessGroup != omp_access_cgroup)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
auto DeviceOrErr = PM->getDevice(DeviceNum);
|
||||||
|
if (!DeviceOrErr)
|
||||||
|
FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());
|
||||||
|
|
||||||
|
return DeviceOrErr->getInfo<uint64_t>(DeviceInfo::WORK_GROUP_LOCAL_MEM_SIZE);
|
||||||
|
}
|
||||||
|
|
||||||
EXTERN void *omp_target_alloc(size_t Size, int DeviceNum) {
|
EXTERN void *omp_target_alloc(size_t Size, int DeviceNum) {
|
||||||
TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DeviceNum) +
|
TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DeviceNum) +
|
||||||
";size=" + std::to_string(Size));
|
";size=" + std::to_string(Size));
|
||||||
|
|||||||
@@ -43,6 +43,7 @@ VERS1.0 {
|
|||||||
omp_get_device_from_uid;
|
omp_get_device_from_uid;
|
||||||
omp_get_uid_from_device;
|
omp_get_uid_from_device;
|
||||||
omp_get_initial_device;
|
omp_get_initial_device;
|
||||||
|
omp_get_gprivate_limit;
|
||||||
omp_target_alloc;
|
omp_target_alloc;
|
||||||
omp_target_free;
|
omp_target_free;
|
||||||
omp_target_is_accessible;
|
omp_target_is_accessible;
|
||||||
|
|||||||
@@ -559,6 +559,9 @@ struct AMDGPUKernelTy : public GenericKernelTy {
|
|||||||
return Err;
|
return Err;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Set the static block memory size required by the kernel.
|
||||||
|
StaticBlockMemSize = GroupSize;
|
||||||
|
|
||||||
// Make sure it is a kernel symbol.
|
// Make sure it is a kernel symbol.
|
||||||
if (SymbolType != HSA_SYMBOL_KIND_KERNEL)
|
if (SymbolType != HSA_SYMBOL_KIND_KERNEL)
|
||||||
return Plugin::error(ErrorCode::INVALID_BINARY,
|
return Plugin::error(ErrorCode::INVALID_BINARY,
|
||||||
@@ -582,8 +585,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
|
|||||||
|
|
||||||
/// Launch the AMDGPU kernel function.
|
/// Launch the AMDGPU kernel function.
|
||||||
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
|
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
|
||||||
uint32_t NumBlocks[3], KernelArgsTy &KernelArgs,
|
uint32_t NumBlocks[3], uint32_t DynBlockMemSize,
|
||||||
KernelLaunchParamsTy LaunchParams,
|
KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams,
|
||||||
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
|
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
|
||||||
|
|
||||||
/// Return maximum block size for maximum occupancy
|
/// Return maximum block size for maximum occupancy
|
||||||
@@ -3220,7 +3223,7 @@ private:
|
|||||||
KernelArgsTy KernelArgs = {};
|
KernelArgsTy KernelArgs = {};
|
||||||
uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u};
|
uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u};
|
||||||
if (auto Err = AMDGPUKernel.launchImpl(
|
if (auto Err = AMDGPUKernel.launchImpl(
|
||||||
*this, NumBlocksAndThreads, NumBlocksAndThreads, KernelArgs,
|
*this, NumBlocksAndThreads, NumBlocksAndThreads, 0, KernelArgs,
|
||||||
KernelLaunchParamsTy{}, AsyncInfoWrapper))
|
KernelLaunchParamsTy{}, AsyncInfoWrapper))
|
||||||
return Err;
|
return Err;
|
||||||
|
|
||||||
@@ -3755,6 +3758,7 @@ private:
|
|||||||
|
|
||||||
Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
|
Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
|
||||||
uint32_t NumThreads[3], uint32_t NumBlocks[3],
|
uint32_t NumThreads[3], uint32_t NumBlocks[3],
|
||||||
|
uint32_t DynBlockMemSize,
|
||||||
KernelArgsTy &KernelArgs,
|
KernelArgsTy &KernelArgs,
|
||||||
KernelLaunchParamsTy LaunchParams,
|
KernelLaunchParamsTy LaunchParams,
|
||||||
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
|
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
|
||||||
@@ -3767,13 +3771,6 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
|
|||||||
if (auto Err = ArgsMemoryManager.allocate(ArgsSize, &AllArgs))
|
if (auto Err = ArgsMemoryManager.allocate(ArgsSize, &AllArgs))
|
||||||
return Err;
|
return Err;
|
||||||
|
|
||||||
// Account for user requested dynamic shared memory.
|
|
||||||
uint32_t GroupSize = getGroupSize();
|
|
||||||
if (uint32_t MaxDynCGroupMem = std::max(
|
|
||||||
KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize())) {
|
|
||||||
GroupSize += MaxDynCGroupMem;
|
|
||||||
}
|
|
||||||
|
|
||||||
uint64_t StackSize;
|
uint64_t StackSize;
|
||||||
if (auto Err = GenericDevice.getDeviceStackSize(StackSize))
|
if (auto Err = GenericDevice.getDeviceStackSize(StackSize))
|
||||||
return Err;
|
return Err;
|
||||||
@@ -3825,9 +3822,17 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
|
|||||||
KernelArgs.DynCGroupMem);
|
KernelArgs.DynCGroupMem);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Increase to the requested dynamic memory size for the device if needed.
|
||||||
|
DynBlockMemSize =
|
||||||
|
std::max(DynBlockMemSize, GenericDevice.getDynamicMemorySize());
|
||||||
|
|
||||||
|
// HSA requires the group segment size to include both static and dynamic.
|
||||||
|
uint32_t TotalBlockMemSize = getStaticBlockMemSize() + DynBlockMemSize;
|
||||||
|
|
||||||
// Push the kernel launch into the stream.
|
// Push the kernel launch into the stream.
|
||||||
return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
|
return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
|
||||||
GroupSize, StackSize, ArgsMemoryManager);
|
TotalBlockMemSize, StackSize,
|
||||||
|
ArgsMemoryManager);
|
||||||
}
|
}
|
||||||
|
|
||||||
Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
|
Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
|
||||||
|
|||||||
@@ -301,6 +301,18 @@ private:
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/// Configuration of dynamic block memory needed for launching a kernel.
|
||||||
|
struct DynBlockMemConfTy {
|
||||||
|
/// The size of the dynamic block memory buffer.
|
||||||
|
uint32_t Size = 0;
|
||||||
|
/// The size of dynamic shared memory natively provided by the device.
|
||||||
|
uint32_t NativeSize = 0;
|
||||||
|
/// The fallback that was triggered (if any).
|
||||||
|
DynCGroupMemFallbackType Fallback = DynCGroupMemFallbackType::None;
|
||||||
|
/// The fallback pointer if global memory was used as alternative.
|
||||||
|
void *FallbackPtr = nullptr;
|
||||||
|
};
|
||||||
|
|
||||||
/// Class wrapping a __tgt_device_image and its offload entry table on a
|
/// Class wrapping a __tgt_device_image and its offload entry table on a
|
||||||
/// specific device. This class is responsible for storing and managing
|
/// specific device. This class is responsible for storing and managing
|
||||||
/// the offload entries for an image on a device.
|
/// the offload entries for an image on a device.
|
||||||
@@ -363,7 +375,7 @@ struct GenericKernelTy {
|
|||||||
AsyncInfoWrapperTy &AsyncInfoWrapper) const;
|
AsyncInfoWrapperTy &AsyncInfoWrapper) const;
|
||||||
virtual Error launchImpl(GenericDeviceTy &GenericDevice,
|
virtual Error launchImpl(GenericDeviceTy &GenericDevice,
|
||||||
uint32_t NumThreads[3], uint32_t NumBlocks[3],
|
uint32_t NumThreads[3], uint32_t NumBlocks[3],
|
||||||
KernelArgsTy &KernelArgs,
|
uint32_t DynBlockMemSize, KernelArgsTy &KernelArgs,
|
||||||
KernelLaunchParamsTy LaunchParams,
|
KernelLaunchParamsTy LaunchParams,
|
||||||
AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0;
|
AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0;
|
||||||
|
|
||||||
@@ -373,6 +385,9 @@ struct GenericKernelTy {
|
|||||||
/// Get the kernel name.
|
/// Get the kernel name.
|
||||||
const char *getName() const { return Name.c_str(); }
|
const char *getName() const { return Name.c_str(); }
|
||||||
|
|
||||||
|
/// Get the size of the static per-block memory consumed by the kernel.
|
||||||
|
uint32_t getStaticBlockMemSize() const { return StaticBlockMemSize; };
|
||||||
|
|
||||||
/// Get the kernel image.
|
/// Get the kernel image.
|
||||||
DeviceImageTy &getImage() const {
|
DeviceImageTy &getImage() const {
|
||||||
assert(ImagePtr && "Kernel is not initialized!");
|
assert(ImagePtr && "Kernel is not initialized!");
|
||||||
@@ -386,8 +401,10 @@ struct GenericKernelTy {
|
|||||||
|
|
||||||
/// Return a device pointer to a new kernel launch environment.
|
/// Return a device pointer to a new kernel launch environment.
|
||||||
Expected<KernelLaunchEnvironmentTy *>
|
Expected<KernelLaunchEnvironmentTy *>
|
||||||
getKernelLaunchEnvironment(GenericDeviceTy &GenericDevice, uint32_t Version,
|
getKernelLaunchEnvironment(GenericDeviceTy &GenericDevice,
|
||||||
AsyncInfoWrapperTy &AsyncInfo) const;
|
const KernelArgsTy &KernelArgs,
|
||||||
|
const DynBlockMemConfTy &DynBlockMemConf,
|
||||||
|
AsyncInfoWrapperTy &AsyncInfoWrapper) const;
|
||||||
|
|
||||||
/// Indicate whether an execution mode is valid.
|
/// Indicate whether an execution mode is valid.
|
||||||
static bool isValidExecutionMode(OMPTgtExecModeFlags ExecutionMode) {
|
static bool isValidExecutionMode(OMPTgtExecModeFlags ExecutionMode) {
|
||||||
@@ -433,6 +450,12 @@ protected:
|
|||||||
uint32_t NumBlocks[3]) const;
|
uint32_t NumBlocks[3]) const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
/// Prepare the block memory buffer requested for the kernel and execute the
|
||||||
|
/// specified fallback if necessary.
|
||||||
|
Expected<DynBlockMemConfTy> prepareBlockMemory(GenericDeviceTy &GenericDevice,
|
||||||
|
KernelArgsTy &KernelArgs,
|
||||||
|
uint32_t NumBlocks) const;
|
||||||
|
|
||||||
/// Prepare the arguments before launching the kernel.
|
/// Prepare the arguments before launching the kernel.
|
||||||
KernelLaunchParamsTy
|
KernelLaunchParamsTy
|
||||||
prepareArgs(GenericDeviceTy &GenericDevice, void **ArgPtrs,
|
prepareArgs(GenericDeviceTy &GenericDevice, void **ArgPtrs,
|
||||||
@@ -487,6 +510,9 @@ protected:
|
|||||||
/// The maximum number of threads which the kernel could leverage.
|
/// The maximum number of threads which the kernel could leverage.
|
||||||
uint32_t MaxNumThreads;
|
uint32_t MaxNumThreads;
|
||||||
|
|
||||||
|
/// The static memory sized per block.
|
||||||
|
uint32_t StaticBlockMemSize = 0;
|
||||||
|
|
||||||
/// The kernel environment, including execution flags.
|
/// The kernel environment, including execution flags.
|
||||||
KernelEnvironmentTy KernelEnvironment;
|
KernelEnvironmentTy KernelEnvironment;
|
||||||
|
|
||||||
@@ -1498,6 +1524,9 @@ public:
|
|||||||
/// Query the current state of an asynchronous queue.
|
/// Query the current state of an asynchronous queue.
|
||||||
int32_t query_async(int32_t DeviceId, __tgt_async_info *AsyncInfoPtr);
|
int32_t query_async(int32_t DeviceId, __tgt_async_info *AsyncInfoPtr);
|
||||||
|
|
||||||
|
/// Obtain information about the given device.
|
||||||
|
InfoTreeNode obtain_device_info(int32_t DeviceId);
|
||||||
|
|
||||||
/// Prints information about the given devices supported by the plugin.
|
/// Prints information about the given devices supported by the plugin.
|
||||||
void print_device_info(int32_t DeviceId);
|
void print_device_info(int32_t DeviceId);
|
||||||
|
|
||||||
|
|||||||
@@ -435,20 +435,21 @@ Error GenericKernelTy::init(GenericDeviceTy &GenericDevice,
|
|||||||
|
|
||||||
Expected<KernelLaunchEnvironmentTy *>
|
Expected<KernelLaunchEnvironmentTy *>
|
||||||
GenericKernelTy::getKernelLaunchEnvironment(
|
GenericKernelTy::getKernelLaunchEnvironment(
|
||||||
GenericDeviceTy &GenericDevice, uint32_t Version,
|
GenericDeviceTy &GenericDevice, const KernelArgsTy &KernelArgs,
|
||||||
|
const DynBlockMemConfTy &DynBlockMemConf,
|
||||||
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
|
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
|
||||||
// Ctor/Dtor have no arguments, replaying uses the original kernel launch
|
// Ctor/Dtor have no arguments, replaying uses the original kernel launch
|
||||||
// environment. Older versions of the compiler do not generate a kernel
|
// environment. Older versions of the compiler do not generate a kernel
|
||||||
// launch environment.
|
// launch environment.
|
||||||
if (GenericDevice.Plugin.getRecordReplay().isReplaying() ||
|
if (GenericDevice.Plugin.getRecordReplay().isReplaying() ||
|
||||||
Version < OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR)
|
KernelArgs.Version < OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR)
|
||||||
return nullptr;
|
return nullptr;
|
||||||
|
|
||||||
if (!KernelEnvironment.Configuration.ReductionDataSize ||
|
if ((!KernelEnvironment.Configuration.ReductionDataSize ||
|
||||||
!KernelEnvironment.Configuration.ReductionBufferLength)
|
!KernelEnvironment.Configuration.ReductionBufferLength) &&
|
||||||
|
KernelArgs.DynCGroupMem == 0)
|
||||||
return reinterpret_cast<KernelLaunchEnvironmentTy *>(~0);
|
return reinterpret_cast<KernelLaunchEnvironmentTy *>(~0);
|
||||||
|
|
||||||
// TODO: Check if the kernel needs a launch environment.
|
|
||||||
auto AllocOrErr = GenericDevice.dataAlloc(sizeof(KernelLaunchEnvironmentTy),
|
auto AllocOrErr = GenericDevice.dataAlloc(sizeof(KernelLaunchEnvironmentTy),
|
||||||
/*HostPtr=*/nullptr,
|
/*HostPtr=*/nullptr,
|
||||||
TargetAllocTy::TARGET_ALLOC_DEVICE);
|
TargetAllocTy::TARGET_ALLOC_DEVICE);
|
||||||
@@ -462,7 +463,14 @@ GenericKernelTy::getKernelLaunchEnvironment(
|
|||||||
/// async data transfer.
|
/// async data transfer.
|
||||||
auto &LocalKLE = (*AsyncInfoWrapper).KernelLaunchEnvironment;
|
auto &LocalKLE = (*AsyncInfoWrapper).KernelLaunchEnvironment;
|
||||||
LocalKLE = KernelLaunchEnvironment;
|
LocalKLE = KernelLaunchEnvironment;
|
||||||
{
|
|
||||||
|
LocalKLE.DynCGroupMemSize = DynBlockMemConf.Size;
|
||||||
|
LocalKLE.DynCGroupMemFbPtr = DynBlockMemConf.FallbackPtr;
|
||||||
|
LocalKLE.DynCGroupMemFb = DynBlockMemConf.Fallback;
|
||||||
|
LocalKLE.ReductionBuffer = nullptr;
|
||||||
|
|
||||||
|
if (KernelEnvironment.Configuration.ReductionDataSize &&
|
||||||
|
KernelEnvironment.Configuration.ReductionBufferLength) {
|
||||||
auto AllocOrErr = GenericDevice.dataAlloc(
|
auto AllocOrErr = GenericDevice.dataAlloc(
|
||||||
KernelEnvironment.Configuration.ReductionDataSize *
|
KernelEnvironment.Configuration.ReductionDataSize *
|
||||||
KernelEnvironment.Configuration.ReductionBufferLength,
|
KernelEnvironment.Configuration.ReductionBufferLength,
|
||||||
@@ -508,14 +516,81 @@ Error GenericKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
|
|||||||
return Plugin::success();
|
return Plugin::success();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Expected<DynBlockMemConfTy>
|
||||||
|
GenericKernelTy::prepareBlockMemory(GenericDeviceTy &GenericDevice,
|
||||||
|
KernelArgsTy &KernelArgs,
|
||||||
|
uint32_t NumBlocks) const {
|
||||||
|
uint32_t MaxBlockMemSize = GenericDevice.getMaxBlockSharedMemSize();
|
||||||
|
uint32_t DynBlockMemSize = KernelArgs.DynCGroupMem;
|
||||||
|
uint32_t TotalBlockMemSize = StaticBlockMemSize + DynBlockMemSize;
|
||||||
|
uint32_t DynNativeBlockMemSize = DynBlockMemSize;
|
||||||
|
void *DynFallbackPtr = nullptr;
|
||||||
|
|
||||||
|
// No enough block memory to cover the static one. Cannot run the kernel.
|
||||||
|
if (StaticBlockMemSize > MaxBlockMemSize)
|
||||||
|
return Plugin::error(ErrorCode::INVALID_ARGUMENT,
|
||||||
|
"Static block memory size exceeds maximum");
|
||||||
|
// No enough block memory to cover dynamic one, and the fallback is aborting.
|
||||||
|
if (static_cast<DynCGroupMemFallbackType>(
|
||||||
|
KernelArgs.Flags.DynCGroupMemFallback) ==
|
||||||
|
DynCGroupMemFallbackType::Abort &&
|
||||||
|
TotalBlockMemSize > MaxBlockMemSize)
|
||||||
|
return Plugin::error(
|
||||||
|
ErrorCode::INVALID_ARGUMENT,
|
||||||
|
"Requested block memory size (static + dynamic) exceeds maximum");
|
||||||
|
|
||||||
|
DynCGroupMemFallbackType DynFallback = DynCGroupMemFallbackType::None;
|
||||||
|
if (DynBlockMemSize && TotalBlockMemSize > MaxBlockMemSize) {
|
||||||
|
// Launch without native dynamic block memory.
|
||||||
|
DynNativeBlockMemSize = 0;
|
||||||
|
DynFallback = static_cast<DynCGroupMemFallbackType>(
|
||||||
|
KernelArgs.Flags.DynCGroupMemFallback);
|
||||||
|
if (DynFallback != DynCGroupMemFallbackType::DefaultMem) {
|
||||||
|
// Do not provide any memory as fallback.
|
||||||
|
DynBlockMemSize = 0;
|
||||||
|
} else {
|
||||||
|
// Get global memory as fallback.
|
||||||
|
auto AllocOrErr = GenericDevice.dataAlloc(
|
||||||
|
NumBlocks * DynBlockMemSize,
|
||||||
|
/*HostPtr=*/nullptr, TargetAllocTy::TARGET_ALLOC_DEVICE);
|
||||||
|
if (!AllocOrErr)
|
||||||
|
return AllocOrErr.takeError();
|
||||||
|
DynFallbackPtr = *AllocOrErr;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return DynBlockMemConfTy{DynBlockMemSize, DynNativeBlockMemSize, DynFallback,
|
||||||
|
DynFallbackPtr};
|
||||||
|
}
|
||||||
|
|
||||||
Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
|
Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
|
||||||
ptrdiff_t *ArgOffsets, KernelArgsTy &KernelArgs,
|
ptrdiff_t *ArgOffsets, KernelArgsTy &KernelArgs,
|
||||||
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
|
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
|
||||||
llvm::SmallVector<void *, 16> Args;
|
llvm::SmallVector<void *, 16> Args;
|
||||||
llvm::SmallVector<void *, 16> Ptrs;
|
llvm::SmallVector<void *, 16> Ptrs;
|
||||||
|
|
||||||
|
uint32_t NumThreads[3] = {KernelArgs.ThreadLimit[0],
|
||||||
|
KernelArgs.ThreadLimit[1],
|
||||||
|
KernelArgs.ThreadLimit[2]};
|
||||||
|
uint32_t NumBlocks[3] = {KernelArgs.NumTeams[0], KernelArgs.NumTeams[1],
|
||||||
|
KernelArgs.NumTeams[2]};
|
||||||
|
if (!isBareMode()) {
|
||||||
|
NumThreads[0] = getNumThreads(GenericDevice, NumThreads);
|
||||||
|
NumBlocks[0] = getNumBlocks(GenericDevice, NumBlocks, KernelArgs.Tripcount,
|
||||||
|
NumThreads[0], KernelArgs.ThreadLimit[0] > 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
auto DynBlockMemConfOrErr =
|
||||||
|
prepareBlockMemory(GenericDevice, KernelArgs, NumBlocks[0]);
|
||||||
|
if (!DynBlockMemConfOrErr)
|
||||||
|
return DynBlockMemConfOrErr.takeError();
|
||||||
|
|
||||||
|
DynBlockMemConfTy &DynBlockMemConf = *DynBlockMemConfOrErr;
|
||||||
|
if (DynBlockMemConf.FallbackPtr)
|
||||||
|
AsyncInfoWrapper.freeAllocationAfterSynchronization(
|
||||||
|
DynBlockMemConf.FallbackPtr);
|
||||||
|
|
||||||
auto KernelLaunchEnvOrErr = getKernelLaunchEnvironment(
|
auto KernelLaunchEnvOrErr = getKernelLaunchEnvironment(
|
||||||
GenericDevice, KernelArgs.Version, AsyncInfoWrapper);
|
GenericDevice, KernelArgs, DynBlockMemConf, AsyncInfoWrapper);
|
||||||
if (!KernelLaunchEnvOrErr)
|
if (!KernelLaunchEnvOrErr)
|
||||||
return KernelLaunchEnvOrErr.takeError();
|
return KernelLaunchEnvOrErr.takeError();
|
||||||
|
|
||||||
@@ -531,17 +606,6 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
|
|||||||
Args, Ptrs, *KernelLaunchEnvOrErr);
|
Args, Ptrs, *KernelLaunchEnvOrErr);
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t NumThreads[3] = {KernelArgs.ThreadLimit[0],
|
|
||||||
KernelArgs.ThreadLimit[1],
|
|
||||||
KernelArgs.ThreadLimit[2]};
|
|
||||||
uint32_t NumBlocks[3] = {KernelArgs.NumTeams[0], KernelArgs.NumTeams[1],
|
|
||||||
KernelArgs.NumTeams[2]};
|
|
||||||
if (!isBareMode()) {
|
|
||||||
NumThreads[0] = getNumThreads(GenericDevice, NumThreads);
|
|
||||||
NumBlocks[0] = getNumBlocks(GenericDevice, NumBlocks, KernelArgs.Tripcount,
|
|
||||||
NumThreads[0], KernelArgs.ThreadLimit[0] > 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Record the kernel description after we modified the argument count and num
|
// Record the kernel description after we modified the argument count and num
|
||||||
// blocks/threads.
|
// blocks/threads.
|
||||||
RecordReplayTy &RecordReplay = GenericDevice.Plugin.getRecordReplay();
|
RecordReplayTy &RecordReplay = GenericDevice.Plugin.getRecordReplay();
|
||||||
@@ -557,8 +621,9 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
|
|||||||
printLaunchInfo(GenericDevice, KernelArgs, NumThreads, NumBlocks))
|
printLaunchInfo(GenericDevice, KernelArgs, NumThreads, NumBlocks))
|
||||||
return Err;
|
return Err;
|
||||||
|
|
||||||
return launchImpl(GenericDevice, NumThreads, NumBlocks, KernelArgs,
|
return launchImpl(GenericDevice, NumThreads, NumBlocks,
|
||||||
LaunchParams, AsyncInfoWrapper);
|
DynBlockMemConf.NativeSize, KernelArgs, LaunchParams,
|
||||||
|
AsyncInfoWrapper);
|
||||||
}
|
}
|
||||||
|
|
||||||
KernelLaunchParamsTy GenericKernelTy::prepareArgs(
|
KernelLaunchParamsTy GenericKernelTy::prepareArgs(
|
||||||
@@ -1954,6 +2019,16 @@ int32_t GenericPluginTy::query_async(int32_t DeviceId,
|
|||||||
return OFFLOAD_SUCCESS;
|
return OFFLOAD_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
InfoTreeNode GenericPluginTy::obtain_device_info(int32_t DeviceId) {
|
||||||
|
auto InfoOrErr = getDevice(DeviceId).obtainInfo();
|
||||||
|
if (auto Err = InfoOrErr.takeError()) {
|
||||||
|
REPORT() << "Failure to obtain device " << DeviceId
|
||||||
|
<< " info: " << toString(std::move(Err));
|
||||||
|
return InfoTreeNode{};
|
||||||
|
}
|
||||||
|
return std::move(*InfoOrErr);
|
||||||
|
}
|
||||||
|
|
||||||
void GenericPluginTy::print_device_info(int32_t DeviceId) {
|
void GenericPluginTy::print_device_info(int32_t DeviceId) {
|
||||||
if (auto Err = getDevice(DeviceId).printInfo())
|
if (auto Err = getDevice(DeviceId).printInfo())
|
||||||
REPORT() << "Failure to print device " << DeviceId
|
REPORT() << "Failure to print device " << DeviceId
|
||||||
|
|||||||
@@ -261,6 +261,7 @@ typedef enum CUdevice_attribute_enum {
|
|||||||
|
|
||||||
typedef enum CUfunction_attribute_enum {
|
typedef enum CUfunction_attribute_enum {
|
||||||
CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,
|
CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,
|
||||||
|
CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1,
|
||||||
CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8,
|
CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8,
|
||||||
} CUfunction_attribute;
|
} CUfunction_attribute;
|
||||||
|
|
||||||
|
|||||||
@@ -150,14 +150,23 @@ struct CUDAKernelTy : public GenericKernelTy {
|
|||||||
// The maximum number of threads cannot exceed the maximum of the kernel.
|
// The maximum number of threads cannot exceed the maximum of the kernel.
|
||||||
MaxNumThreads = std::min(MaxNumThreads, (uint32_t)MaxThreads);
|
MaxNumThreads = std::min(MaxNumThreads, (uint32_t)MaxThreads);
|
||||||
|
|
||||||
|
int SharedMemSize;
|
||||||
|
Res = cuFuncGetAttribute(&SharedMemSize,
|
||||||
|
CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, Func);
|
||||||
|
if (auto Err = Plugin::check(Res, "Error in cuFuncGetAttribute: %s"))
|
||||||
|
return Err;
|
||||||
|
|
||||||
|
// Set the static block memory size required by the kernel.
|
||||||
|
StaticBlockMemSize = SharedMemSize;
|
||||||
|
|
||||||
// Retrieve the size of the arguments.
|
// Retrieve the size of the arguments.
|
||||||
return initArgsSize();
|
return initArgsSize();
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Launch the CUDA kernel function.
|
/// Launch the CUDA kernel function.
|
||||||
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
|
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
|
||||||
uint32_t NumBlocks[3], KernelArgsTy &KernelArgs,
|
uint32_t NumBlocks[3], uint32_t DynBlockMemSize,
|
||||||
KernelLaunchParamsTy LaunchParams,
|
KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams,
|
||||||
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
|
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
|
||||||
|
|
||||||
/// Return maximum block size for maximum occupancy
|
/// Return maximum block size for maximum occupancy
|
||||||
@@ -197,7 +206,7 @@ private:
|
|||||||
CUfunction Func;
|
CUfunction Func;
|
||||||
/// The maximum amount of dynamic shared memory per thread group. By default,
|
/// The maximum amount of dynamic shared memory per thread group. By default,
|
||||||
/// this is set to 48 KB.
|
/// this is set to 48 KB.
|
||||||
mutable uint32_t MaxDynCGroupMemLimit = 49152;
|
mutable uint32_t MaxDynBlockMemSize = 49152;
|
||||||
|
|
||||||
/// The size of the kernel arguments.
|
/// The size of the kernel arguments.
|
||||||
size_t ArgsSize;
|
size_t ArgsSize;
|
||||||
@@ -1411,7 +1420,7 @@ private:
|
|||||||
KernelArgsTy KernelArgs = {};
|
KernelArgsTy KernelArgs = {};
|
||||||
uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u};
|
uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u};
|
||||||
if (auto Err = CUDAKernel.launchImpl(
|
if (auto Err = CUDAKernel.launchImpl(
|
||||||
*this, NumBlocksAndThreads, NumBlocksAndThreads, KernelArgs,
|
*this, NumBlocksAndThreads, NumBlocksAndThreads, 0, KernelArgs,
|
||||||
KernelLaunchParamsTy{}, AsyncInfoWrapper))
|
KernelLaunchParamsTy{}, AsyncInfoWrapper))
|
||||||
return Err;
|
return Err;
|
||||||
|
|
||||||
@@ -1455,6 +1464,7 @@ private:
|
|||||||
|
|
||||||
Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
|
Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
|
||||||
uint32_t NumThreads[3], uint32_t NumBlocks[3],
|
uint32_t NumThreads[3], uint32_t NumBlocks[3],
|
||||||
|
uint32_t DynBlockMemSize,
|
||||||
KernelArgsTy &KernelArgs,
|
KernelArgsTy &KernelArgs,
|
||||||
KernelLaunchParamsTy LaunchParams,
|
KernelLaunchParamsTy LaunchParams,
|
||||||
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
|
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
|
||||||
@@ -1470,9 +1480,6 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
|
|||||||
if (auto Err = CUDADevice.getStream(AsyncInfoWrapper, Stream))
|
if (auto Err = CUDADevice.getStream(AsyncInfoWrapper, Stream))
|
||||||
return Err;
|
return Err;
|
||||||
|
|
||||||
uint32_t MaxDynCGroupMem =
|
|
||||||
std::max(KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize());
|
|
||||||
|
|
||||||
size_t ConfigArgsSize = ArgsSize;
|
size_t ConfigArgsSize = ArgsSize;
|
||||||
void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, LaunchParams.Data,
|
void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, LaunchParams.Data,
|
||||||
CU_LAUNCH_PARAM_BUFFER_SIZE,
|
CU_LAUNCH_PARAM_BUFFER_SIZE,
|
||||||
@@ -1484,20 +1491,24 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
|
|||||||
if (GenericDevice.getRPCServer())
|
if (GenericDevice.getRPCServer())
|
||||||
GenericDevice.Plugin.getRPCServer().Thread->notify();
|
GenericDevice.Plugin.getRPCServer().Thread->notify();
|
||||||
|
|
||||||
|
// Increase to the requested dynamic memory size for the device if needed.
|
||||||
|
DynBlockMemSize =
|
||||||
|
std::max(DynBlockMemSize, GenericDevice.getDynamicMemorySize());
|
||||||
|
|
||||||
// In case we require more memory than the current limit.
|
// In case we require more memory than the current limit.
|
||||||
if (MaxDynCGroupMem >= MaxDynCGroupMemLimit) {
|
if (DynBlockMemSize >= MaxDynBlockMemSize) {
|
||||||
CUresult AttrResult = cuFuncSetAttribute(
|
CUresult AttrResult = cuFuncSetAttribute(
|
||||||
Func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, MaxDynCGroupMem);
|
Func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, DynBlockMemSize);
|
||||||
if (auto Err = Plugin::check(
|
if (auto Err = Plugin::check(
|
||||||
AttrResult,
|
AttrResult,
|
||||||
"error in cuFuncSetAttribute while setting the memory limits: %s"))
|
"error in cuFuncSetAttribute while setting the memory limits: %s"))
|
||||||
return Err;
|
return Err;
|
||||||
MaxDynCGroupMemLimit = MaxDynCGroupMem;
|
MaxDynBlockMemSize = DynBlockMemSize;
|
||||||
}
|
}
|
||||||
|
|
||||||
CUresult Res = cuLaunchKernel(Func, NumBlocks[0], NumBlocks[1], NumBlocks[2],
|
CUresult Res = cuLaunchKernel(Func, NumBlocks[0], NumBlocks[1], NumBlocks[2],
|
||||||
NumThreads[0], NumThreads[1], NumThreads[2],
|
NumThreads[0], NumThreads[1], NumThreads[2],
|
||||||
MaxDynCGroupMem, Stream, nullptr, Config);
|
DynBlockMemSize, Stream, nullptr, Config);
|
||||||
|
|
||||||
// Register a callback to indicate when the kernel is complete.
|
// Register a callback to indicate when the kernel is complete.
|
||||||
if (GenericDevice.getRPCServer())
|
if (GenericDevice.getRPCServer())
|
||||||
|
|||||||
@@ -97,8 +97,8 @@ struct GenELF64KernelTy : public GenericKernelTy {
|
|||||||
|
|
||||||
/// Launch the kernel using the libffi.
|
/// Launch the kernel using the libffi.
|
||||||
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
|
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
|
||||||
uint32_t NumBlocks[3], KernelArgsTy &KernelArgs,
|
uint32_t NumBlocks[3], uint32_t DynBlockMemSize,
|
||||||
KernelLaunchParamsTy LaunchParams,
|
KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams,
|
||||||
AsyncInfoWrapperTy &AsyncInfoWrapper) const override {
|
AsyncInfoWrapperTy &AsyncInfoWrapper) const override {
|
||||||
if (!SupportsFFI)
|
if (!SupportsFFI)
|
||||||
return Plugin::error(ErrorCode::UNSUPPORTED,
|
return Plugin::error(ErrorCode::UNSUPPORTED,
|
||||||
|
|||||||
@@ -124,8 +124,8 @@ public:
|
|||||||
Error initImpl(GenericDeviceTy &GenericDevice, DeviceImageTy &Image) override;
|
Error initImpl(GenericDeviceTy &GenericDevice, DeviceImageTy &Image) override;
|
||||||
/// Launch the L0 kernel function.
|
/// Launch the L0 kernel function.
|
||||||
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
|
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
|
||||||
uint32_t NumBlocks[3], KernelArgsTy &KernelArgs,
|
uint32_t NumBlocks[3], uint32_t DynBlockMemSize,
|
||||||
KernelLaunchParamsTy LaunchParams,
|
KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams,
|
||||||
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
|
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
|
||||||
Error deinit() {
|
Error deinit() {
|
||||||
CALL_ZE_RET_ERROR(zeKernelDestroy, zeKernel);
|
CALL_ZE_RET_ERROR(zeKernelDestroy, zeKernel);
|
||||||
|
|||||||
@@ -413,9 +413,13 @@ Error L0KernelTy::setIndirectFlags(L0DeviceTy &l0Device,
|
|||||||
|
|
||||||
Error L0KernelTy::launchImpl(GenericDeviceTy &GenericDevice,
|
Error L0KernelTy::launchImpl(GenericDeviceTy &GenericDevice,
|
||||||
uint32_t NumThreads[3], uint32_t NumBlocks[3],
|
uint32_t NumThreads[3], uint32_t NumBlocks[3],
|
||||||
KernelArgsTy &KernelArgs,
|
uint32_t DynBlockMemSize, KernelArgsTy &KernelArgs,
|
||||||
KernelLaunchParamsTy LaunchParams,
|
KernelLaunchParamsTy LaunchParams,
|
||||||
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
|
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
|
||||||
|
if (DynBlockMemSize > 0)
|
||||||
|
return Plugin::error(ErrorCode::UNSUPPORTED,
|
||||||
|
"dynamic shared memory is unsupported in L0 plugin");
|
||||||
|
|
||||||
auto &l0Device = L0DeviceTy::makeL0Device(GenericDevice);
|
auto &l0Device = L0DeviceTy::makeL0Device(GenericDevice);
|
||||||
__tgt_async_info *AsyncInfo = AsyncInfoWrapper;
|
__tgt_async_info *AsyncInfo = AsyncInfoWrapper;
|
||||||
|
|
||||||
|
|||||||
199
offload/test/offloading/dyn_groupprivate.cpp
Normal file
199
offload/test/offloading/dyn_groupprivate.cpp
Normal file
@@ -0,0 +1,199 @@
|
|||||||
|
// RUN: %libomptarget-compilexx-generic -fopenmp-version=61
|
||||||
|
// RUN: %libomptarget-run-generic | %fcheck-generic
|
||||||
|
// RUN: %libomptarget-compileoptxx-generic -fopenmp-version=61
|
||||||
|
// RUN: %libomptarget-run-generic | %fcheck-generic
|
||||||
|
// REQUIRES: gpu
|
||||||
|
// UNSUPPORTED: intelgpu
|
||||||
|
|
||||||
|
#include <omp.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
|
#define N 512
|
||||||
|
|
||||||
|
int main() {
|
||||||
|
int Result[N], NumThreads;
|
||||||
|
|
||||||
|
// Verify the groupprivate buffer works as expected.
|
||||||
|
#pragma omp target teams num_teams(1) thread_limit(N) \
|
||||||
|
dyn_groupprivate(fallback(abort) : N * sizeof(Result[0])) \
|
||||||
|
map(from : Result, NumThreads)
|
||||||
|
{
|
||||||
|
int Buffer[N];
|
||||||
|
#pragma omp parallel
|
||||||
|
{
|
||||||
|
int *DynBuffer = (int *)omp_get_dyn_gprivate_nofb_ptr();
|
||||||
|
int TId = omp_get_thread_num();
|
||||||
|
if (TId == 0)
|
||||||
|
NumThreads = omp_get_num_threads();
|
||||||
|
Buffer[TId] = 7;
|
||||||
|
DynBuffer[TId] = 3;
|
||||||
|
#pragma omp barrier
|
||||||
|
int WrappedTId = (TId + 37) % NumThreads;
|
||||||
|
Result[TId] = Buffer[WrappedTId] + DynBuffer[WrappedTId];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (NumThreads < N / 2 || NumThreads > N) {
|
||||||
|
printf("Expected number of threads to be in [%i:%i], but got: %i", N / 2, N,
|
||||||
|
NumThreads);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
int Failed = 0;
|
||||||
|
for (int i = 0; i < NumThreads; ++i) {
|
||||||
|
if (Result[i] != 7 + 3) {
|
||||||
|
printf("Result[%i] is %i, expected %i\n", i, Result[i], 7 + 3);
|
||||||
|
++Failed;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify that the routines in the host returns NULL and zero.
|
||||||
|
if (omp_get_dyn_gprivate_ptr())
|
||||||
|
++Failed;
|
||||||
|
if (omp_get_dyn_gprivate_nofb_ptr())
|
||||||
|
++Failed;
|
||||||
|
if (omp_get_dyn_gprivate_size())
|
||||||
|
++Failed;
|
||||||
|
|
||||||
|
size_t MaxSize = omp_get_gprivate_limit(0, omp_access_cgroup);
|
||||||
|
size_t ExceededSize = MaxSize + 10;
|
||||||
|
|
||||||
|
// Verify that the fallback(default_mem) modifier works.
|
||||||
|
#pragma omp target dyn_groupprivate(fallback(default_mem) : ExceededSize) \
|
||||||
|
map(tofrom : Failed)
|
||||||
|
{
|
||||||
|
if (!omp_get_dyn_gprivate_ptr(0))
|
||||||
|
++Failed;
|
||||||
|
if (omp_get_dyn_gprivate_nofb_ptr(0))
|
||||||
|
++Failed;
|
||||||
|
if (omp_get_dyn_gprivate_ptr(0) == omp_get_dyn_gprivate_nofb_ptr(0))
|
||||||
|
++Failed;
|
||||||
|
if (!omp_get_dyn_gprivate_size())
|
||||||
|
++Failed;
|
||||||
|
if (omp_get_dyn_gprivate_size() != ExceededSize)
|
||||||
|
++Failed;
|
||||||
|
if (omp_get_dyn_gprivate_memspace() != omp_default_mem_space)
|
||||||
|
++Failed;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify that the fallback(null) modifier works.
|
||||||
|
#pragma omp target dyn_groupprivate(fallback(null) : ExceededSize) \
|
||||||
|
map(tofrom : Failed)
|
||||||
|
{
|
||||||
|
if (omp_get_dyn_gprivate_ptr(0))
|
||||||
|
++Failed;
|
||||||
|
if (omp_get_dyn_gprivate_nofb_ptr(0))
|
||||||
|
++Failed;
|
||||||
|
if (omp_get_dyn_gprivate_ptr(0) != omp_get_dyn_gprivate_nofb_ptr(0))
|
||||||
|
++Failed;
|
||||||
|
if (omp_get_dyn_gprivate_size())
|
||||||
|
++Failed;
|
||||||
|
if (omp_get_dyn_gprivate_memspace() != omp_null_mem_space)
|
||||||
|
++Failed;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify that the default modifier is fallback(default_mem).
|
||||||
|
#pragma omp target dyn_groupprivate(ExceededSize)
|
||||||
|
{
|
||||||
|
if (!omp_get_dyn_gprivate_ptr(0))
|
||||||
|
++Failed;
|
||||||
|
if (omp_get_dyn_gprivate_nofb_ptr(0))
|
||||||
|
++Failed;
|
||||||
|
if (omp_get_dyn_gprivate_ptr(0) == omp_get_dyn_gprivate_nofb_ptr(0))
|
||||||
|
++Failed;
|
||||||
|
if (!omp_get_dyn_gprivate_size())
|
||||||
|
++Failed;
|
||||||
|
if (omp_get_dyn_gprivate_size() != ExceededSize)
|
||||||
|
++Failed;
|
||||||
|
if (omp_get_dyn_gprivate_memspace() != omp_default_mem_space)
|
||||||
|
++Failed;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify that the fallback(abort) modifier works.
|
||||||
|
#pragma omp target dyn_groupprivate(fallback(abort) : N) map(tofrom : Failed)
|
||||||
|
{
|
||||||
|
if (!omp_get_dyn_gprivate_ptr(0))
|
||||||
|
++Failed;
|
||||||
|
if (!omp_get_dyn_gprivate_nofb_ptr(0))
|
||||||
|
++Failed;
|
||||||
|
if (omp_get_dyn_gprivate_ptr(0) != omp_get_dyn_gprivate_nofb_ptr(0))
|
||||||
|
++Failed;
|
||||||
|
if (omp_get_dyn_gprivate_ptr(5) != omp_get_dyn_gprivate_nofb_ptr(5))
|
||||||
|
++Failed;
|
||||||
|
if (!omp_get_dyn_gprivate_size())
|
||||||
|
++Failed;
|
||||||
|
if (omp_get_dyn_gprivate_size() != N)
|
||||||
|
++Failed;
|
||||||
|
if (omp_get_dyn_gprivate_memspace() != omp_cgroup_mem_space)
|
||||||
|
++Failed;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify that the fallback(default_mem) does not trigger when not needed.
|
||||||
|
#pragma omp target dyn_groupprivate(fallback(default_mem) : N) \
|
||||||
|
map(tofrom : Failed)
|
||||||
|
{
|
||||||
|
if (!omp_get_dyn_gprivate_ptr(0))
|
||||||
|
++Failed;
|
||||||
|
if (!omp_get_dyn_gprivate_nofb_ptr(0))
|
||||||
|
++Failed;
|
||||||
|
if (omp_get_dyn_gprivate_ptr(0) != omp_get_dyn_gprivate_nofb_ptr(0))
|
||||||
|
++Failed;
|
||||||
|
if (!omp_get_dyn_gprivate_size())
|
||||||
|
++Failed;
|
||||||
|
if (omp_get_dyn_gprivate_size() != N)
|
||||||
|
++Failed;
|
||||||
|
if (omp_get_dyn_gprivate_memspace() != omp_cgroup_mem_space)
|
||||||
|
++Failed;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify that the clause works when passing a zero size.
|
||||||
|
#pragma omp target dyn_groupprivate(fallback(abort) : 0) map(tofrom : Failed)
|
||||||
|
{
|
||||||
|
if (omp_get_dyn_gprivate_ptr(0))
|
||||||
|
++Failed;
|
||||||
|
if (omp_get_dyn_gprivate_nofb_ptr(0))
|
||||||
|
++Failed;
|
||||||
|
if (omp_get_dyn_gprivate_ptr(0) != omp_get_dyn_gprivate_nofb_ptr(0))
|
||||||
|
++Failed;
|
||||||
|
if (omp_get_dyn_gprivate_size())
|
||||||
|
++Failed;
|
||||||
|
if (omp_get_dyn_gprivate_memspace() != omp_null_mem_space)
|
||||||
|
++Failed;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify that the clause works when passing a zero size and
|
||||||
|
// fallback(default_mem).
|
||||||
|
#pragma omp target dyn_groupprivate(fallback(default_mem) : 0) \
|
||||||
|
map(tofrom : Failed)
|
||||||
|
{
|
||||||
|
if (omp_get_dyn_gprivate_ptr(0))
|
||||||
|
++Failed;
|
||||||
|
if (omp_get_dyn_gprivate_nofb_ptr(0))
|
||||||
|
++Failed;
|
||||||
|
if (omp_get_dyn_gprivate_ptr(0) != omp_get_dyn_gprivate_nofb_ptr(0))
|
||||||
|
++Failed;
|
||||||
|
if (omp_get_dyn_gprivate_size())
|
||||||
|
++Failed;
|
||||||
|
if (omp_get_dyn_gprivate_memspace() != omp_null_mem_space)
|
||||||
|
++Failed;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify that omitting the clause is the same as setting zero size.
|
||||||
|
#pragma omp target map(tofrom : Failed)
|
||||||
|
{
|
||||||
|
if (omp_get_dyn_gprivate_ptr(0))
|
||||||
|
++Failed;
|
||||||
|
if (omp_get_dyn_gprivate_nofb_ptr(0))
|
||||||
|
++Failed;
|
||||||
|
if (omp_get_dyn_gprivate_ptr(0) != omp_get_dyn_gprivate_nofb_ptr(0))
|
||||||
|
++Failed;
|
||||||
|
if (omp_get_dyn_gprivate_size())
|
||||||
|
++Failed;
|
||||||
|
if (omp_get_dyn_gprivate_memspace() != omp_null_mem_space)
|
||||||
|
++Failed;
|
||||||
|
}
|
||||||
|
|
||||||
|
// CHECK: PASS
|
||||||
|
if (!Failed)
|
||||||
|
printf("PASS\n");
|
||||||
|
}
|
||||||
@@ -171,9 +171,29 @@ typedef enum omp_allocator_handle_t {
|
|||||||
KMP_ALLOCATOR_MAX_HANDLE = ~(0LU)
|
KMP_ALLOCATOR_MAX_HANDLE = ~(0LU)
|
||||||
} omp_allocator_handle_t;
|
} omp_allocator_handle_t;
|
||||||
|
|
||||||
|
typedef enum omp_memspace_handle_t {
|
||||||
|
omp_null_mem_space = 0,
|
||||||
|
omp_default_mem_space = 99,
|
||||||
|
omp_large_cap_mem_space = 1,
|
||||||
|
omp_const_mem_space = 2,
|
||||||
|
omp_high_bw_mem_space = 3,
|
||||||
|
omp_low_lat_mem_space = 4,
|
||||||
|
omp_cgroup_mem_space = 5,
|
||||||
|
KMP_MEMSPACE_MAX_HANDLE = ~(0LU)
|
||||||
|
} omp_memspace_handle_t;
|
||||||
|
|
||||||
#define __PRAGMA(STR) _Pragma(#STR)
|
#define __PRAGMA(STR) _Pragma(#STR)
|
||||||
#define OMP_PRAGMA(STR) __PRAGMA(omp STR)
|
#define OMP_PRAGMA(STR) __PRAGMA(omp STR)
|
||||||
|
|
||||||
///}
|
///}
|
||||||
|
|
||||||
|
/// The OpenMP access group type. The criterion for grupping tasks using a
|
||||||
|
/// specific grouping property.
|
||||||
|
enum omp_access_t {
|
||||||
|
/// Groups the tasks based on the contention group to which they belong.
|
||||||
|
omp_access_cgroup = 0,
|
||||||
|
/// Groups the tasks based on the parallel region to which they bind.
|
||||||
|
omp_access_pteam = 1,
|
||||||
|
};
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -226,7 +226,7 @@ struct KernelEnvironmentTy;
|
|||||||
int8_t __kmpc_is_spmd_exec_mode();
|
int8_t __kmpc_is_spmd_exec_mode();
|
||||||
|
|
||||||
int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
|
int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
|
||||||
KernelLaunchEnvironmentTy &KernelLaunchEnvironment);
|
KernelLaunchEnvironmentTy *KernelLaunchEnvironment);
|
||||||
|
|
||||||
void __kmpc_target_deinit();
|
void __kmpc_target_deinit();
|
||||||
|
|
||||||
|
|||||||
@@ -116,7 +116,7 @@ extern Local<ThreadStateTy **> ThreadStates;
|
|||||||
|
|
||||||
/// Initialize the state machinery. Must be called by all threads.
|
/// Initialize the state machinery. Must be called by all threads.
|
||||||
void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
|
void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
|
||||||
KernelLaunchEnvironmentTy &KernelLaunchEnvironment);
|
KernelLaunchEnvironmentTy *KernelLaunchEnvironment);
|
||||||
|
|
||||||
/// Return the kernel and kernel launch environment associated with the current
|
/// Return the kernel and kernel launch environment associated with the current
|
||||||
/// kernel. The former is static and contains compile time information that
|
/// kernel. The former is static and contains compile time information that
|
||||||
|
|||||||
@@ -35,8 +35,8 @@ enum OMPTgtExecModeFlags : unsigned char {
|
|||||||
};
|
};
|
||||||
|
|
||||||
static void
|
static void
|
||||||
inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
|
initializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
|
||||||
KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
|
KernelLaunchEnvironmentTy *KernelLaunchEnvironment) {
|
||||||
// Order is important here.
|
// Order is important here.
|
||||||
synchronize::init(IsSPMD);
|
synchronize::init(IsSPMD);
|
||||||
mapping::init(IsSPMD);
|
mapping::init(IsSPMD);
|
||||||
@@ -80,17 +80,17 @@ extern "C" {
|
|||||||
/// \param Ident Source location identification, can be NULL.
|
/// \param Ident Source location identification, can be NULL.
|
||||||
///
|
///
|
||||||
int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
|
int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
|
||||||
KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
|
KernelLaunchEnvironmentTy *KernelLaunchEnvironment) {
|
||||||
ConfigurationEnvironmentTy &Configuration = KernelEnvironment.Configuration;
|
ConfigurationEnvironmentTy &Configuration = KernelEnvironment.Configuration;
|
||||||
bool IsSPMD = Configuration.ExecMode & OMP_TGT_EXEC_MODE_SPMD;
|
bool IsSPMD = Configuration.ExecMode & OMP_TGT_EXEC_MODE_SPMD;
|
||||||
bool UseGenericStateMachine = Configuration.UseGenericStateMachine;
|
bool UseGenericStateMachine = Configuration.UseGenericStateMachine;
|
||||||
if (IsSPMD) {
|
if (IsSPMD) {
|
||||||
inititializeRuntime(/*IsSPMD=*/true, KernelEnvironment,
|
initializeRuntime(/*IsSPMD=*/true, KernelEnvironment,
|
||||||
KernelLaunchEnvironment);
|
KernelLaunchEnvironment);
|
||||||
synchronize::threadsAligned(atomic::relaxed);
|
synchronize::threadsAligned(atomic::relaxed);
|
||||||
} else {
|
} else {
|
||||||
inititializeRuntime(/*IsSPMD=*/false, KernelEnvironment,
|
initializeRuntime(/*IsSPMD=*/false, KernelEnvironment,
|
||||||
KernelLaunchEnvironment);
|
KernelLaunchEnvironment);
|
||||||
// No need to wait since only the main threads will execute user
|
// No need to wait since only the main threads will execute user
|
||||||
// code and workers will run into a barrier right away.
|
// code and workers will run into a barrier right away.
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -40,6 +40,10 @@ using namespace ompx;
|
|||||||
[[clang::loader_uninitialized]] static Local<KernelLaunchEnvironmentTy *>
|
[[clang::loader_uninitialized]] static Local<KernelLaunchEnvironmentTy *>
|
||||||
KernelLaunchEnvironmentPtr;
|
KernelLaunchEnvironmentPtr;
|
||||||
|
|
||||||
|
/// The pointer type for dynamic shared memory. This is important to keep
|
||||||
|
/// the alignment and address space information.
|
||||||
|
using SharedMemPtrTy = decltype(&DynamicSharedBuffer[0]);
|
||||||
|
|
||||||
///}
|
///}
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
@@ -138,6 +142,60 @@ void SharedMemorySmartStackTy::pop(void *Ptr, uint64_t Bytes) {
|
|||||||
memory::freeGlobal(Ptr, "Slow path shared memory deallocation");
|
memory::freeGlobal(Ptr, "Slow path shared memory deallocation");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Manager of the dynamic cgroup memory buffer.
|
||||||
|
struct DynCGroupMemTy {
|
||||||
|
/// Initialize the manager with the information from the kernel launch
|
||||||
|
/// enviornment and the pointer to the native shared memory buffer.
|
||||||
|
void init(KernelLaunchEnvironmentTy *KLE, SharedMemPtrTy NativePtr) {
|
||||||
|
// Initialize default values.
|
||||||
|
NativeOrNullPtr = nullptr;
|
||||||
|
FallbackPtr = nullptr;
|
||||||
|
Size = 0;
|
||||||
|
Fallback = DynCGroupMemFallbackType::None;
|
||||||
|
if (!KLE)
|
||||||
|
return;
|
||||||
|
|
||||||
|
// Initialize values using the kernel launch environment.
|
||||||
|
Size = KLE->DynCGroupMemSize;
|
||||||
|
Fallback = KLE->DynCGroupMemFb;
|
||||||
|
if (Size && Fallback == DynCGroupMemFallbackType::None)
|
||||||
|
NativeOrNullPtr = NativePtr;
|
||||||
|
if (Fallback == DynCGroupMemFallbackType::DefaultMem)
|
||||||
|
FallbackPtr = static_cast<unsigned char *>(KLE->DynCGroupMemFbPtr) +
|
||||||
|
Size * mapping::getBlockIdInKernel();
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get the memory space of the buffer.
|
||||||
|
omp_memspace_handle_t getMemSpace() const {
|
||||||
|
if (Size == 0)
|
||||||
|
return omp_null_mem_space;
|
||||||
|
if (Fallback == DynCGroupMemFallbackType::None)
|
||||||
|
return omp_cgroup_mem_space;
|
||||||
|
return omp_default_mem_space;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get the size of the buffer.
|
||||||
|
size_t getSize() const { return Size; }
|
||||||
|
|
||||||
|
/// Get the native pointer or null if it was a fallback.
|
||||||
|
SharedMemPtrTy getNativeOrNullPtr() const { return NativeOrNullPtr; }
|
||||||
|
|
||||||
|
/// Get the native pointer or the fallback pointer.
|
||||||
|
unsigned char *getNativeOrFallbackPtr() const {
|
||||||
|
return (Fallback == DynCGroupMemFallbackType::DefaultMem)
|
||||||
|
? FallbackPtr
|
||||||
|
: getNativeOrNullPtr();
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
SharedMemPtrTy NativeOrNullPtr;
|
||||||
|
unsigned char *FallbackPtr;
|
||||||
|
size_t Size;
|
||||||
|
DynCGroupMemFallbackType Fallback;
|
||||||
|
};
|
||||||
|
|
||||||
|
[[clang::loader_uninitialized]] static Local<DynCGroupMemTy> DynCGroupMem;
|
||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
void *memory::getDynamicBuffer() { return DynamicSharedBuffer; }
|
void *memory::getDynamicBuffer() { return DynamicSharedBuffer; }
|
||||||
@@ -226,13 +284,18 @@ int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
|
|||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
|
void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
|
||||||
KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
|
KernelLaunchEnvironmentTy *KLE) {
|
||||||
SharedMemorySmartStack.init(IsSPMD);
|
SharedMemorySmartStack.init(IsSPMD);
|
||||||
|
|
||||||
|
if (KLE == reinterpret_cast<KernelLaunchEnvironmentTy *>(~0))
|
||||||
|
KLE = nullptr;
|
||||||
|
|
||||||
if (mapping::isInitialThreadInLevel0(IsSPMD)) {
|
if (mapping::isInitialThreadInLevel0(IsSPMD)) {
|
||||||
|
DynCGroupMem.init(KLE, DynamicSharedBuffer);
|
||||||
TeamState.init(IsSPMD);
|
TeamState.init(IsSPMD);
|
||||||
ThreadStates = nullptr;
|
ThreadStates = nullptr;
|
||||||
KernelEnvironmentPtr = &KernelEnvironment;
|
KernelEnvironmentPtr = &KernelEnvironment;
|
||||||
KernelLaunchEnvironmentPtr = &KernelLaunchEnvironment;
|
KernelLaunchEnvironmentPtr = KLE;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -416,6 +479,25 @@ int omp_get_team_num() { return mapping::getBlockIdInKernel(); }
|
|||||||
int omp_get_initial_device(void) { return -1; }
|
int omp_get_initial_device(void) { return -1; }
|
||||||
|
|
||||||
int omp_is_initial_device(void) { return 0; }
|
int omp_is_initial_device(void) { return 0; }
|
||||||
|
|
||||||
|
void *omp_get_dyn_gprivate_ptr(size_t Offset, omp_access_t) {
|
||||||
|
return DynCGroupMem.getNativeOrFallbackPtr() + Offset;
|
||||||
|
}
|
||||||
|
|
||||||
|
void *omp_get_dyn_gprivate_nofb_ptr(size_t Offset, omp_access_t) {
|
||||||
|
unsigned char *Ptr = DynCGroupMem.getNativeOrNullPtr();
|
||||||
|
// Ensure the alignment and address space information is kept.
|
||||||
|
Ptr = (unsigned char *)__builtin_assume_aligned(Ptr, allocator::ALIGNMENT);
|
||||||
|
return (SharedMemPtrTy)(Ptr + Offset);
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t omp_get_dyn_gprivate_size(omp_access_t) {
|
||||||
|
return DynCGroupMem.getSize();
|
||||||
|
}
|
||||||
|
|
||||||
|
omp_memspace_handle_t omp_get_dyn_gprivate_memspace(omp_access_t) {
|
||||||
|
return DynCGroupMem.getMemSpace();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
extern "C" {
|
extern "C" {
|
||||||
|
|||||||
@@ -607,6 +607,7 @@ kmp_set_disp_num_buffers 890
|
|||||||
llvm_omp_target_shared_mem_space DATA
|
llvm_omp_target_shared_mem_space DATA
|
||||||
llvm_omp_target_device_mem_space DATA
|
llvm_omp_target_device_mem_space DATA
|
||||||
omp_null_mem_space DATA
|
omp_null_mem_space DATA
|
||||||
|
omp_cgroup_mem_space DATA
|
||||||
|
|
||||||
%ifndef stub
|
%ifndef stub
|
||||||
# Ordinals between 900 and 999 are reserved
|
# Ordinals between 900 and 999 are reserved
|
||||||
|
|||||||
@@ -380,6 +380,11 @@
|
|||||||
omp_uintptr_t value;
|
omp_uintptr_t value;
|
||||||
} omp_alloctrait_t;
|
} omp_alloctrait_t;
|
||||||
|
|
||||||
|
typedef enum {
|
||||||
|
omp_access_cgroup = 0,
|
||||||
|
omp_access_pteam = 1
|
||||||
|
} omp_access_t;
|
||||||
|
|
||||||
# if defined(_WIN32)
|
# if defined(_WIN32)
|
||||||
// On Windows cl and icl do not support 64-bit enum, let's use integer then.
|
// On Windows cl and icl do not support 64-bit enum, let's use integer then.
|
||||||
typedef omp_uintptr_t omp_allocator_handle_t;
|
typedef omp_uintptr_t omp_allocator_handle_t;
|
||||||
@@ -403,6 +408,7 @@
|
|||||||
extern __KMP_IMP omp_memspace_handle_t const omp_const_mem_space;
|
extern __KMP_IMP omp_memspace_handle_t const omp_const_mem_space;
|
||||||
extern __KMP_IMP omp_memspace_handle_t const omp_high_bw_mem_space;
|
extern __KMP_IMP omp_memspace_handle_t const omp_high_bw_mem_space;
|
||||||
extern __KMP_IMP omp_memspace_handle_t const omp_low_lat_mem_space;
|
extern __KMP_IMP omp_memspace_handle_t const omp_low_lat_mem_space;
|
||||||
|
extern __KMP_IMP omp_memspace_handle_t const omp_cgroup_mem_space;
|
||||||
extern __KMP_IMP omp_memspace_handle_t const llvm_omp_target_host_mem_space;
|
extern __KMP_IMP omp_memspace_handle_t const llvm_omp_target_host_mem_space;
|
||||||
extern __KMP_IMP omp_memspace_handle_t const llvm_omp_target_shared_mem_space;
|
extern __KMP_IMP omp_memspace_handle_t const llvm_omp_target_shared_mem_space;
|
||||||
extern __KMP_IMP omp_memspace_handle_t const llvm_omp_target_device_mem_space;
|
extern __KMP_IMP omp_memspace_handle_t const llvm_omp_target_device_mem_space;
|
||||||
@@ -439,6 +445,7 @@
|
|||||||
omp_const_mem_space = 2,
|
omp_const_mem_space = 2,
|
||||||
omp_high_bw_mem_space = 3,
|
omp_high_bw_mem_space = 3,
|
||||||
omp_low_lat_mem_space = 4,
|
omp_low_lat_mem_space = 4,
|
||||||
|
omp_cgroup_mem_space = 5,
|
||||||
llvm_omp_target_host_mem_space = 100,
|
llvm_omp_target_host_mem_space = 100,
|
||||||
llvm_omp_target_shared_mem_space = 101,
|
llvm_omp_target_shared_mem_space = 101,
|
||||||
llvm_omp_target_device_mem_space = 102,
|
llvm_omp_target_device_mem_space = 102,
|
||||||
@@ -463,6 +470,11 @@
|
|||||||
omp_allocator_handle_t allocator = omp_null_allocator,
|
omp_allocator_handle_t allocator = omp_null_allocator,
|
||||||
omp_allocator_handle_t free_allocator = omp_null_allocator);
|
omp_allocator_handle_t free_allocator = omp_null_allocator);
|
||||||
extern void __KAI_KMPC_CONVENTION omp_free(void * ptr, omp_allocator_handle_t a = omp_null_allocator);
|
extern void __KAI_KMPC_CONVENTION omp_free(void * ptr, omp_allocator_handle_t a = omp_null_allocator);
|
||||||
|
extern void *__KAI_KMPC_CONVENTION omp_get_dyn_gprivate_ptr(size_t offset = 0, omp_access_t access_group = omp_access_cgroup);
|
||||||
|
extern void *__KAI_KMPC_CONVENTION omp_get_dyn_gprivate_nofb_ptr(size_t offset = 0, omp_access_t access_group = omp_access_cgroup);
|
||||||
|
extern size_t __KAI_KMPC_CONVENTION omp_get_dyn_gprivate_size(omp_access_t access_group = omp_access_cgroup);
|
||||||
|
extern omp_memspace_handle_t __KAI_KMPC_CONVENTION omp_get_dyn_gprivate_memspace(omp_access_t access_group = omp_access_cgroup);
|
||||||
|
extern size_t __KAI_KMPC_CONVENTION omp_get_gprivate_limit(int device_num, omp_access_t access_group = omp_access_cgroup);
|
||||||
# else
|
# else
|
||||||
extern void *__KAI_KMPC_CONVENTION omp_alloc(size_t size, omp_allocator_handle_t a);
|
extern void *__KAI_KMPC_CONVENTION omp_alloc(size_t size, omp_allocator_handle_t a);
|
||||||
extern void *__KAI_KMPC_CONVENTION omp_aligned_alloc(size_t align, size_t size,
|
extern void *__KAI_KMPC_CONVENTION omp_aligned_alloc(size_t align, size_t size,
|
||||||
@@ -473,6 +485,11 @@
|
|||||||
extern void *__KAI_KMPC_CONVENTION omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator,
|
extern void *__KAI_KMPC_CONVENTION omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator,
|
||||||
omp_allocator_handle_t free_allocator);
|
omp_allocator_handle_t free_allocator);
|
||||||
extern void __KAI_KMPC_CONVENTION omp_free(void *ptr, omp_allocator_handle_t a);
|
extern void __KAI_KMPC_CONVENTION omp_free(void *ptr, omp_allocator_handle_t a);
|
||||||
|
extern void *__KAI_KMPC_CONVENTION omp_get_dyn_gprivate_ptr(size_t offset, omp_access_t access_group);
|
||||||
|
extern void *__KAI_KMPC_CONVENTION omp_get_dyn_gprivate_nofb_ptr(size_t offset, omp_access_t access_group);
|
||||||
|
extern size_t __KAI_KMPC_CONVENTION omp_get_dyn_gprivate_size(omp_access_t access_group);
|
||||||
|
extern omp_memspace_handle_t __KAI_KMPC_CONVENTION omp_get_dyn_gprivate_memspace(omp_access_t access_group);
|
||||||
|
extern size_t __KAI_KMPC_CONVENTION omp_get_gprivate_limit(int device_num, omp_access_t access_group);
|
||||||
# endif
|
# endif
|
||||||
|
|
||||||
/* OpenMP TR11 routines to get memory spaces and allocators */
|
/* OpenMP TR11 routines to get memory spaces and allocators */
|
||||||
|
|||||||
@@ -1072,6 +1072,7 @@ extern omp_memspace_handle_t const omp_large_cap_mem_space;
|
|||||||
extern omp_memspace_handle_t const omp_const_mem_space;
|
extern omp_memspace_handle_t const omp_const_mem_space;
|
||||||
extern omp_memspace_handle_t const omp_high_bw_mem_space;
|
extern omp_memspace_handle_t const omp_high_bw_mem_space;
|
||||||
extern omp_memspace_handle_t const omp_low_lat_mem_space;
|
extern omp_memspace_handle_t const omp_low_lat_mem_space;
|
||||||
|
extern omp_memspace_handle_t const omp_cgroup_mem_space;
|
||||||
extern omp_memspace_handle_t const llvm_omp_target_host_mem_space;
|
extern omp_memspace_handle_t const llvm_omp_target_host_mem_space;
|
||||||
extern omp_memspace_handle_t const llvm_omp_target_shared_mem_space;
|
extern omp_memspace_handle_t const llvm_omp_target_shared_mem_space;
|
||||||
extern omp_memspace_handle_t const llvm_omp_target_device_mem_space;
|
extern omp_memspace_handle_t const llvm_omp_target_device_mem_space;
|
||||||
|
|||||||
@@ -4515,6 +4515,20 @@ void omp_free(void *ptr, omp_allocator_handle_t allocator) {
|
|||||||
}
|
}
|
||||||
/* end of OpenMP 5.1 Memory Management routines */
|
/* end of OpenMP 5.1 Memory Management routines */
|
||||||
|
|
||||||
|
void *omp_get_dyn_gprivate_ptr(size_t offset, omp_access_t access_group) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
void *omp_get_dyn_gprivate_nofb_ptr(size_t offset, omp_access_t access_group) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t omp_get_dyn_gprivate_size(omp_access_t access_group) { return 0; }
|
||||||
|
|
||||||
|
omp_memspace_handle_t omp_get_dyn_gprivate_memspace(omp_access_t access_group) {
|
||||||
|
return omp_null_mem_space;
|
||||||
|
}
|
||||||
|
|
||||||
int __kmpc_get_target_offload(void) {
|
int __kmpc_get_target_offload(void) {
|
||||||
if (!__kmp_init_serial) {
|
if (!__kmp_init_serial) {
|
||||||
__kmp_serial_initialize();
|
__kmp_serial_initialize();
|
||||||
|
|||||||
@@ -333,6 +333,8 @@ omp_memspace_handle_t const omp_high_bw_mem_space =
|
|||||||
(omp_memspace_handle_t const)3;
|
(omp_memspace_handle_t const)3;
|
||||||
omp_memspace_handle_t const omp_low_lat_mem_space =
|
omp_memspace_handle_t const omp_low_lat_mem_space =
|
||||||
(omp_memspace_handle_t const)4;
|
(omp_memspace_handle_t const)4;
|
||||||
|
omp_memspace_handle_t const omp_cgroup_mem_space =
|
||||||
|
(omp_memspace_handle_t const)5;
|
||||||
omp_memspace_handle_t const llvm_omp_target_host_mem_space =
|
omp_memspace_handle_t const llvm_omp_target_host_mem_space =
|
||||||
(omp_memspace_handle_t const)100;
|
(omp_memspace_handle_t const)100;
|
||||||
omp_memspace_handle_t const llvm_omp_target_shared_mem_space =
|
omp_memspace_handle_t const llvm_omp_target_shared_mem_space =
|
||||||
|
|||||||
@@ -368,6 +368,8 @@ omp_memspace_handle_t const omp_high_bw_mem_space =
|
|||||||
(omp_memspace_handle_t const)3;
|
(omp_memspace_handle_t const)3;
|
||||||
omp_memspace_handle_t const omp_low_lat_mem_space =
|
omp_memspace_handle_t const omp_low_lat_mem_space =
|
||||||
(omp_memspace_handle_t const)4;
|
(omp_memspace_handle_t const)4;
|
||||||
|
omp_memspace_handle_t const omp_cgroup_mem_space =
|
||||||
|
(omp_memspace_handle_t const)5;
|
||||||
omp_memspace_handle_t const llvm_omp_target_host_mem_space =
|
omp_memspace_handle_t const llvm_omp_target_host_mem_space =
|
||||||
(omp_memspace_handle_t const)100;
|
(omp_memspace_handle_t const)100;
|
||||||
omp_memspace_handle_t const llvm_omp_target_shared_mem_space =
|
omp_memspace_handle_t const llvm_omp_target_shared_mem_space =
|
||||||
@@ -454,6 +456,31 @@ void omp_free(void *ptr, omp_allocator_handle_t allocator) {
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void *omp_get_dyn_gprivate_ptr(size_t offset, omp_access_t access_group) {
|
||||||
|
i;
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
void *omp_get_dyn_gprivate_nofb_ptr(size_t offset, omp_access_t access_group) {
|
||||||
|
i;
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t omp_get_dyn_gprivate_size(omp_access_t access_group) {
|
||||||
|
i;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
omp_memspace_handle_t omp_get_dyn_gprivate_memspace(omp_access_t access_group) {
|
||||||
|
i;
|
||||||
|
return omp_null_mem_space;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t omp_get_gprivate_limit(int device_num, omp_access_t access_group) {
|
||||||
|
i;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
/* OpenMP 5.0 Affinity Format */
|
/* OpenMP 5.0 Affinity Format */
|
||||||
void omp_set_affinity_format(char const *format) { i; }
|
void omp_set_affinity_format(char const *format) { i; }
|
||||||
size_t omp_get_affinity_format(char *buffer, size_t size) {
|
size_t omp_get_affinity_format(char *buffer, size_t size) {
|
||||||
|
|||||||
Reference in New Issue
Block a user