[OpenMP][Offload] Add offload runtime support for dyn_groupprivate clause (#152831)
Part 3, adding offload runtime support. See https://github.com/llvm/llvm-project/pull/152651. --------- Co-authored-by: Krzysztof Parzyszek <Krzysztof.Parzyszek@amd.com>
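For context, a minimal usage sketch of the clause this runtime support enables. It is modeled on the new offload test added at the end of this commit (compiled with -fopenmp-version=61); the helper function name is made up, and the dyn_groupprivate syntax and omp_get_dyn_gprivate_* routines are the ones introduced by this series:

#include <omp.h>

void fill_and_copy(int *Out, int N) { // illustrative helper, not from the patch
#pragma omp target teams num_teams(1) thread_limit(256) \
    dyn_groupprivate(fallback(default_mem) : N * sizeof(int)) \
    map(from : Out[:N])
  {
    // Per-team dynamic buffer requested by the clause; falls back to global
    // memory if the device cannot natively provide that much shared memory.
    int *Buf = (int *)omp_get_dyn_gprivate_ptr();
#pragma omp parallel for
    for (int I = 0; I < N; ++I)
      Buf[I] = 2 * I;
#pragma omp parallel for
    for (int I = 0; I < N; ++I)
      Out[I] = Buf[I];
  }
}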
@@ -101,7 +101,7 @@ __OMP_STRUCT_TYPE(DynamicEnvironment, DynamicEnvironmentTy, false, Int16)
|
||||
__OMP_STRUCT_TYPE(KernelEnvironment, KernelEnvironmentTy, false,
|
||||
ConfigurationEnvironment, IdentPtr, DynamicEnvironmentPtr)
|
||||
__OMP_STRUCT_TYPE(KernelLaunchEnvironment, KernelLaunchEnvironmentTy, false,
|
||||
Int32, Int32)
|
||||
VoidPtr, VoidPtr, Int32, Int32, Int32, Int8)
|
||||
|
||||
#undef __OMP_STRUCT_TYPE
|
||||
#undef OMP_STRUCT_TYPE
|
||||
|
||||
@@ -102,8 +102,9 @@ struct KernelArgsTy {
  struct {
    uint64_t NoWait : 1; // Was this kernel spawned with a `nowait` clause.
    uint64_t IsCUDA : 1; // Was this kernel spawned via CUDA.
    uint64_t Unused : 62;
  } Flags = {0, 0, 0};
    uint64_t DynCGroupMemFallback : 2; // The fallback for dynamic cgroup mem.
    uint64_t Unused : 60;
  } Flags = {0, 0, 0, 0};
  // The number of teams (for x,y,z dimension).
  uint32_t NumTeams[3] = {0, 0, 0};
  // The number of threads (for x,y,z dimension).
@@ -70,10 +70,25 @@ struct KernelEnvironmentTy {
  DynamicEnvironmentTy *DynamicEnv = nullptr;
};

/// The fallback types for the dynamic cgroup memory.
enum class DynCGroupMemFallbackType : uint8_t {
  /// None. Used for indicating that no fallback was triggered.
  None = 0,
  /// Abort the execution.
  Abort = None,
  /// Return null pointer.
  Null = 1,
  /// Allocate from an implementation-defined memory space.
  DefaultMem = 2
};

struct KernelLaunchEnvironmentTy {
  void *ReductionBuffer = nullptr;
  void *DynCGroupMemFbPtr = nullptr;
  uint32_t ReductionCnt = 0;
  uint32_t ReductionIterCnt = 0;
  void *ReductionBuffer = nullptr;
  uint32_t DynCGroupMemSize = 0;
  DynCGroupMemFallbackType DynCGroupMemFb = DynCGroupMemFallbackType::None;
};

#endif // OMPTARGET_SHARED_ENVIRONMENT_H
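How the two extensions fit together, as a hedged host-side sketch: the field and enum names come from the structs above, but the literal values and the assignment site are illustrative and not code from this patch.

// Hypothetical host-side sketch: encode a dyn_groupprivate request in the
// extended KernelArgsTy, then publish the plugin's decision to the device
// through the extended KernelLaunchEnvironmentTy.
KernelArgsTy Args{};
Args.DynCGroupMem = 4096; // bytes requested per contention group (team)
Args.Flags.DynCGroupMemFallback =
    static_cast<uint64_t>(DynCGroupMemFallbackType::DefaultMem); // 2-bit field

KernelLaunchEnvironmentTy KLE{};
KLE.DynCGroupMemSize = Args.DynCGroupMem;             // what the kernel observes
KLE.DynCGroupMemFb = DynCGroupMemFallbackType::None;  // no fallback triggered
KLE.DynCGroupMemFbPtr = nullptr;                      // only set for DefaultMem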
@@ -37,6 +37,8 @@
|
||||
#include "PluginInterface.h"
|
||||
|
||||
using GenericPluginTy = llvm::omp::target::plugin::GenericPluginTy;
|
||||
using DeviceInfo = llvm::omp::target::plugin::DeviceInfo;
|
||||
using InfoTreeNode = llvm::omp::target::plugin::InfoTreeNode;
|
||||
|
||||
// Forward declarations.
|
||||
struct __tgt_bin_desc;
|
||||
@@ -167,6 +169,20 @@ struct DeviceTy {
|
||||
/// Indicate that there are pending images for this device or not.
|
||||
void setHasPendingImages(bool V) { HasPendingImages = V; }
|
||||
|
||||
/// Get information from the device.
|
||||
template <typename T> T getInfo(DeviceInfo Info) const {
|
||||
InfoTreeNode DevInfo = RTL->obtain_device_info(RTLDeviceID);
|
||||
|
||||
auto EntryOpt = DevInfo.get(Info);
|
||||
if (!EntryOpt)
|
||||
return 0;
|
||||
|
||||
auto Entry = *EntryOpt;
|
||||
if (!std::holds_alternative<T>(Entry->Value))
|
||||
return T{};
|
||||
return std::get<T>(Entry->Value);
|
||||
}
|
||||
|
||||
private:
|
||||
/// Deinitialize the device (and plugin).
|
||||
void deinit();
|
||||
|
||||
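A usage sketch for the new DeviceTy::getInfo helper, mirroring the query that omp_get_gprivate_limit performs later in this patch; note that getInfo returns a default-constructed T when the plugin did not report the entry or reported it with a different type.

auto DeviceOrErr = PM->getDevice(DeviceNum);
if (!DeviceOrErr)
  FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());
// 0 if the plugin does not expose this entry in its InfoTreeNode.
uint64_t LocalMemSize =
    DeviceOrErr->getInfo<uint64_t>(DeviceInfo::WORK_GROUP_LOCAL_MEM_SIZE);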
@@ -274,12 +274,23 @@ struct __tgt_target_non_contig {
extern "C" {
#endif

/// The OpenMP access group type. The criterion for grouping tasks using a
/// specific grouping property.
enum omp_access_t {
  /// Groups the tasks based on the contention group to which they belong.
  omp_access_cgroup = 0,
  /// Groups the tasks based on the parallel region to which they bind.
  omp_access_pteam = 1,
};

void ompx_dump_mapping_tables(void);
int omp_get_num_devices(void);
int omp_get_device_num(void);
int omp_get_device_from_uid(const char *DeviceUid);
const char *omp_get_uid_from_device(int DeviceNum);
int omp_get_initial_device(void);
size_t omp_get_gprivate_limit(int DeviceNum,
                              omp_access_t AccessGroup = omp_access_cgroup);
void *omp_target_alloc(size_t Size, int DeviceNum);
void omp_target_free(void *DevicePtr, int DeviceNum);
int omp_target_is_present(const void *Ptr, int DeviceNum);
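Host-side usage, as exercised by the new test further down; the routine returns 0 for the host device and for access groups other than omp_access_cgroup.

size_t MaxSize = omp_get_gprivate_limit(/*DeviceNum=*/0, omp_access_cgroup);
size_t ExceededSize = MaxSize + 10; // large enough to force a fallback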
|
||||
@@ -138,6 +138,22 @@ EXTERN int omp_get_initial_device(void) {
  return HostDevice;
}

EXTERN size_t omp_get_gprivate_limit(int DeviceNum, omp_access_t AccessGroup) {
  TIMESCOPE();
  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
  if (DeviceNum == omp_get_initial_device())
    return 0;

  if (AccessGroup != omp_access_cgroup)
    return 0;

  auto DeviceOrErr = PM->getDevice(DeviceNum);
  if (!DeviceOrErr)
    FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());

  return DeviceOrErr->getInfo<uint64_t>(DeviceInfo::WORK_GROUP_LOCAL_MEM_SIZE);
}

EXTERN void *omp_target_alloc(size_t Size, int DeviceNum) {
  TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DeviceNum) +
                         ";size=" + std::to_string(Size));
@@ -43,6 +43,7 @@ VERS1.0 {
|
||||
omp_get_device_from_uid;
|
||||
omp_get_uid_from_device;
|
||||
omp_get_initial_device;
|
||||
omp_get_gprivate_limit;
|
||||
omp_target_alloc;
|
||||
omp_target_free;
|
||||
omp_target_is_accessible;
|
||||
|
||||
@@ -559,6 +559,9 @@ struct AMDGPUKernelTy : public GenericKernelTy {
|
||||
return Err;
|
||||
}
|
||||
|
||||
// Set the static block memory size required by the kernel.
|
||||
StaticBlockMemSize = GroupSize;
|
||||
|
||||
// Make sure it is a kernel symbol.
|
||||
if (SymbolType != HSA_SYMBOL_KIND_KERNEL)
|
||||
return Plugin::error(ErrorCode::INVALID_BINARY,
|
||||
@@ -582,8 +585,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
|
||||
|
||||
/// Launch the AMDGPU kernel function.
|
||||
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
|
||||
uint32_t NumBlocks[3], KernelArgsTy &KernelArgs,
|
||||
KernelLaunchParamsTy LaunchParams,
|
||||
uint32_t NumBlocks[3], uint32_t DynBlockMemSize,
|
||||
KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams,
|
||||
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
|
||||
|
||||
/// Return maximum block size for maximum occupancy
|
||||
@@ -3220,7 +3223,7 @@ private:
|
||||
KernelArgsTy KernelArgs = {};
|
||||
uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u};
|
||||
if (auto Err = AMDGPUKernel.launchImpl(
|
||||
*this, NumBlocksAndThreads, NumBlocksAndThreads, KernelArgs,
|
||||
*this, NumBlocksAndThreads, NumBlocksAndThreads, 0, KernelArgs,
|
||||
KernelLaunchParamsTy{}, AsyncInfoWrapper))
|
||||
return Err;
|
||||
|
||||
@@ -3755,6 +3758,7 @@ private:
|
||||
|
||||
Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
|
||||
uint32_t NumThreads[3], uint32_t NumBlocks[3],
|
||||
uint32_t DynBlockMemSize,
|
||||
KernelArgsTy &KernelArgs,
|
||||
KernelLaunchParamsTy LaunchParams,
|
||||
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
|
||||
@@ -3767,13 +3771,6 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
|
||||
if (auto Err = ArgsMemoryManager.allocate(ArgsSize, &AllArgs))
|
||||
return Err;
|
||||
|
||||
// Account for user requested dynamic shared memory.
|
||||
uint32_t GroupSize = getGroupSize();
|
||||
if (uint32_t MaxDynCGroupMem = std::max(
|
||||
KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize())) {
|
||||
GroupSize += MaxDynCGroupMem;
|
||||
}
|
||||
|
||||
uint64_t StackSize;
|
||||
if (auto Err = GenericDevice.getDeviceStackSize(StackSize))
|
||||
return Err;
|
||||
@@ -3825,9 +3822,17 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
|
||||
KernelArgs.DynCGroupMem);
|
||||
}
|
||||
|
||||
// Increase to the requested dynamic memory size for the device if needed.
|
||||
DynBlockMemSize =
|
||||
std::max(DynBlockMemSize, GenericDevice.getDynamicMemorySize());
|
||||
|
||||
// HSA requires the group segment size to include both static and dynamic.
|
||||
uint32_t TotalBlockMemSize = getStaticBlockMemSize() + DynBlockMemSize;
|
||||
|
||||
// Push the kernel launch into the stream.
|
||||
return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
|
||||
GroupSize, StackSize, ArgsMemoryManager);
|
||||
TotalBlockMemSize, StackSize,
|
||||
ArgsMemoryManager);
|
||||
}
|
||||
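Illustrative arithmetic for the group segment size computed above; the byte values are made up.

// Example: a kernel whose code object reports getStaticBlockMemSize() == 8 KiB,
// launched with a 24 KiB dyn_groupprivate request while getDynamicMemorySize()
// is 0:
//   DynBlockMemSize   = max(24576, 0) = 24576
//   TotalBlockMemSize = 8192 + 24576  = 32768  // per-workgroup HSA group segment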
|
||||
Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
|
||||
|
||||
@@ -301,6 +301,18 @@ private:
  }
};

/// Configuration of dynamic block memory needed for launching a kernel.
struct DynBlockMemConfTy {
  /// The size of the dynamic block memory buffer.
  uint32_t Size = 0;
  /// The size of dynamic shared memory natively provided by the device.
  uint32_t NativeSize = 0;
  /// The fallback that was triggered (if any).
  DynCGroupMemFallbackType Fallback = DynCGroupMemFallbackType::None;
  /// The fallback pointer if global memory was used as alternative.
  void *FallbackPtr = nullptr;
};

/// Class wrapping a __tgt_device_image and its offload entry table on a
/// specific device. This class is responsible for storing and managing
/// the offload entries for an image on a device.
@@ -363,7 +375,7 @@ struct GenericKernelTy {
|
||||
AsyncInfoWrapperTy &AsyncInfoWrapper) const;
|
||||
virtual Error launchImpl(GenericDeviceTy &GenericDevice,
|
||||
uint32_t NumThreads[3], uint32_t NumBlocks[3],
|
||||
KernelArgsTy &KernelArgs,
|
||||
uint32_t DynBlockMemSize, KernelArgsTy &KernelArgs,
|
||||
KernelLaunchParamsTy LaunchParams,
|
||||
AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0;
|
||||
|
||||
@@ -373,6 +385,9 @@ struct GenericKernelTy {
|
||||
/// Get the kernel name.
|
||||
const char *getName() const { return Name.c_str(); }
|
||||
|
||||
/// Get the size of the static per-block memory consumed by the kernel.
|
||||
uint32_t getStaticBlockMemSize() const { return StaticBlockMemSize; };
|
||||
|
||||
/// Get the kernel image.
|
||||
DeviceImageTy &getImage() const {
|
||||
assert(ImagePtr && "Kernel is not initialized!");
|
||||
@@ -386,8 +401,10 @@ struct GenericKernelTy {
|
||||
|
||||
/// Return a device pointer to a new kernel launch environment.
|
||||
Expected<KernelLaunchEnvironmentTy *>
|
||||
getKernelLaunchEnvironment(GenericDeviceTy &GenericDevice, uint32_t Version,
|
||||
AsyncInfoWrapperTy &AsyncInfo) const;
|
||||
getKernelLaunchEnvironment(GenericDeviceTy &GenericDevice,
|
||||
const KernelArgsTy &KernelArgs,
|
||||
const DynBlockMemConfTy &DynBlockMemConf,
|
||||
AsyncInfoWrapperTy &AsyncInfoWrapper) const;
|
||||
|
||||
/// Indicate whether an execution mode is valid.
|
||||
static bool isValidExecutionMode(OMPTgtExecModeFlags ExecutionMode) {
|
||||
@@ -433,6 +450,12 @@ protected:
|
||||
uint32_t NumBlocks[3]) const;
|
||||
|
||||
private:
|
||||
/// Prepare the block memory buffer requested for the kernel and execute the
|
||||
/// specified fallback if necessary.
|
||||
Expected<DynBlockMemConfTy> prepareBlockMemory(GenericDeviceTy &GenericDevice,
|
||||
KernelArgsTy &KernelArgs,
|
||||
uint32_t NumBlocks) const;
|
||||
|
||||
/// Prepare the arguments before launching the kernel.
|
||||
KernelLaunchParamsTy
|
||||
prepareArgs(GenericDeviceTy &GenericDevice, void **ArgPtrs,
|
||||
@@ -487,6 +510,9 @@ protected:
|
||||
/// The maximum number of threads which the kernel could leverage.
|
||||
uint32_t MaxNumThreads;
|
||||
|
||||
/// The static memory size per block.
|
||||
uint32_t StaticBlockMemSize = 0;
|
||||
|
||||
/// The kernel environment, including execution flags.
|
||||
KernelEnvironmentTy KernelEnvironment;
|
||||
|
||||
@@ -1498,6 +1524,9 @@ public:
|
||||
/// Query the current state of an asynchronous queue.
|
||||
int32_t query_async(int32_t DeviceId, __tgt_async_info *AsyncInfoPtr);
|
||||
|
||||
/// Obtain information about the given device.
|
||||
InfoTreeNode obtain_device_info(int32_t DeviceId);
|
||||
|
||||
/// Prints information about the given devices supported by the plugin.
|
||||
void print_device_info(int32_t DeviceId);
|
||||
|
||||
|
||||
@@ -435,20 +435,21 @@ Error GenericKernelTy::init(GenericDeviceTy &GenericDevice,
|
||||
|
||||
Expected<KernelLaunchEnvironmentTy *>
|
||||
GenericKernelTy::getKernelLaunchEnvironment(
|
||||
GenericDeviceTy &GenericDevice, uint32_t Version,
|
||||
GenericDeviceTy &GenericDevice, const KernelArgsTy &KernelArgs,
|
||||
const DynBlockMemConfTy &DynBlockMemConf,
|
||||
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
|
||||
// Ctor/Dtor have no arguments, replaying uses the original kernel launch
|
||||
// environment. Older versions of the compiler do not generate a kernel
|
||||
// launch environment.
|
||||
if (GenericDevice.Plugin.getRecordReplay().isReplaying() ||
|
||||
Version < OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR)
|
||||
KernelArgs.Version < OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR)
|
||||
return nullptr;
|
||||
|
||||
if (!KernelEnvironment.Configuration.ReductionDataSize ||
|
||||
!KernelEnvironment.Configuration.ReductionBufferLength)
|
||||
if ((!KernelEnvironment.Configuration.ReductionDataSize ||
|
||||
!KernelEnvironment.Configuration.ReductionBufferLength) &&
|
||||
KernelArgs.DynCGroupMem == 0)
|
||||
return reinterpret_cast<KernelLaunchEnvironmentTy *>(~0);
|
||||
|
||||
// TODO: Check if the kernel needs a launch environment.
|
||||
auto AllocOrErr = GenericDevice.dataAlloc(sizeof(KernelLaunchEnvironmentTy),
|
||||
/*HostPtr=*/nullptr,
|
||||
TargetAllocTy::TARGET_ALLOC_DEVICE);
|
||||
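One detail worth calling out: the all-ones pointer returned above is a sentinel, not an address.

// ~0 means "this launch needs no KernelLaunchEnvironmentTy at all" (neither
// reductions nor a dyn_groupprivate request).  The device runtime decodes it
// in state::init() in the device-runtime hunk further down:
//
//   if (KLE == reinterpret_cast<KernelLaunchEnvironmentTy *>(~0))
//     KLE = nullptr;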
@@ -462,7 +463,14 @@ GenericKernelTy::getKernelLaunchEnvironment(
|
||||
/// async data transfer.
|
||||
auto &LocalKLE = (*AsyncInfoWrapper).KernelLaunchEnvironment;
|
||||
LocalKLE = KernelLaunchEnvironment;
|
||||
{
|
||||
|
||||
LocalKLE.DynCGroupMemSize = DynBlockMemConf.Size;
|
||||
LocalKLE.DynCGroupMemFbPtr = DynBlockMemConf.FallbackPtr;
|
||||
LocalKLE.DynCGroupMemFb = DynBlockMemConf.Fallback;
|
||||
LocalKLE.ReductionBuffer = nullptr;
|
||||
|
||||
if (KernelEnvironment.Configuration.ReductionDataSize &&
|
||||
KernelEnvironment.Configuration.ReductionBufferLength) {
|
||||
auto AllocOrErr = GenericDevice.dataAlloc(
|
||||
KernelEnvironment.Configuration.ReductionDataSize *
|
||||
KernelEnvironment.Configuration.ReductionBufferLength,
|
||||
@@ -508,14 +516,81 @@ Error GenericKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
  return Plugin::success();
}

Expected<DynBlockMemConfTy>
GenericKernelTy::prepareBlockMemory(GenericDeviceTy &GenericDevice,
                                    KernelArgsTy &KernelArgs,
                                    uint32_t NumBlocks) const {
  uint32_t MaxBlockMemSize = GenericDevice.getMaxBlockSharedMemSize();
  uint32_t DynBlockMemSize = KernelArgs.DynCGroupMem;
  uint32_t TotalBlockMemSize = StaticBlockMemSize + DynBlockMemSize;
  uint32_t DynNativeBlockMemSize = DynBlockMemSize;
  void *DynFallbackPtr = nullptr;

  // Not enough block memory to cover the static one. Cannot run the kernel.
  if (StaticBlockMemSize > MaxBlockMemSize)
    return Plugin::error(ErrorCode::INVALID_ARGUMENT,
                         "Static block memory size exceeds maximum");
  // Not enough block memory to cover the dynamic one, and the fallback is
  // aborting.
  if (static_cast<DynCGroupMemFallbackType>(
          KernelArgs.Flags.DynCGroupMemFallback) ==
          DynCGroupMemFallbackType::Abort &&
      TotalBlockMemSize > MaxBlockMemSize)
    return Plugin::error(
        ErrorCode::INVALID_ARGUMENT,
        "Requested block memory size (static + dynamic) exceeds maximum");

  DynCGroupMemFallbackType DynFallback = DynCGroupMemFallbackType::None;
  if (DynBlockMemSize && TotalBlockMemSize > MaxBlockMemSize) {
    // Launch without native dynamic block memory.
    DynNativeBlockMemSize = 0;
    DynFallback = static_cast<DynCGroupMemFallbackType>(
        KernelArgs.Flags.DynCGroupMemFallback);
    if (DynFallback != DynCGroupMemFallbackType::DefaultMem) {
      // Do not provide any memory as fallback.
      DynBlockMemSize = 0;
    } else {
      // Get global memory as fallback.
      auto AllocOrErr = GenericDevice.dataAlloc(
          NumBlocks * DynBlockMemSize,
          /*HostPtr=*/nullptr, TargetAllocTy::TARGET_ALLOC_DEVICE);
      if (!AllocOrErr)
        return AllocOrErr.takeError();
      DynFallbackPtr = *AllocOrErr;
    }
  }
  return DynBlockMemConfTy{DynBlockMemSize, DynNativeBlockMemSize, DynFallback,
                           DynFallbackPtr};
}
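A worked example of the decision logic above, with made-up sizes (assume MaxBlockMemSize = 64 KiB and StaticBlockMemSize = 16 KiB):

// Request 32 KiB                -> 16 + 32 <= 64: Size = NativeSize = 32 KiB,
//                                  Fallback = None, FallbackPtr = nullptr.
// Request 56 KiB, default_mem   -> 16 + 56 > 64: Size stays 56 KiB,
//                                  NativeSize = 0, NumBlocks * 56 KiB of global
//                                  memory is allocated, FallbackPtr points at
//                                  it, Fallback = DefaultMem.
// Request 56 KiB, null          -> 16 + 56 > 64: Size = NativeSize = 0,
//                                  Fallback = Null, no memory is provided.
// Request 56 KiB, abort         -> the launch fails with INVALID_ARGUMENT
//                                  before any allocation is attempted.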
Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
|
||||
ptrdiff_t *ArgOffsets, KernelArgsTy &KernelArgs,
|
||||
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
|
||||
llvm::SmallVector<void *, 16> Args;
|
||||
llvm::SmallVector<void *, 16> Ptrs;
|
||||
|
||||
uint32_t NumThreads[3] = {KernelArgs.ThreadLimit[0],
|
||||
KernelArgs.ThreadLimit[1],
|
||||
KernelArgs.ThreadLimit[2]};
|
||||
uint32_t NumBlocks[3] = {KernelArgs.NumTeams[0], KernelArgs.NumTeams[1],
|
||||
KernelArgs.NumTeams[2]};
|
||||
if (!isBareMode()) {
|
||||
NumThreads[0] = getNumThreads(GenericDevice, NumThreads);
|
||||
NumBlocks[0] = getNumBlocks(GenericDevice, NumBlocks, KernelArgs.Tripcount,
|
||||
NumThreads[0], KernelArgs.ThreadLimit[0] > 0);
|
||||
}
|
||||
|
||||
auto DynBlockMemConfOrErr =
|
||||
prepareBlockMemory(GenericDevice, KernelArgs, NumBlocks[0]);
|
||||
if (!DynBlockMemConfOrErr)
|
||||
return DynBlockMemConfOrErr.takeError();
|
||||
|
||||
DynBlockMemConfTy &DynBlockMemConf = *DynBlockMemConfOrErr;
|
||||
if (DynBlockMemConf.FallbackPtr)
|
||||
AsyncInfoWrapper.freeAllocationAfterSynchronization(
|
||||
DynBlockMemConf.FallbackPtr);
|
||||
|
||||
auto KernelLaunchEnvOrErr = getKernelLaunchEnvironment(
|
||||
GenericDevice, KernelArgs.Version, AsyncInfoWrapper);
|
||||
GenericDevice, KernelArgs, DynBlockMemConf, AsyncInfoWrapper);
|
||||
if (!KernelLaunchEnvOrErr)
|
||||
return KernelLaunchEnvOrErr.takeError();
|
||||
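// Note on ordering in the rewritten launch() above: prepareBlockMemory() runs
// before getKernelLaunchEnvironment() so the size, fallback kind and fallback
// pointer can be published to the device, and the global-memory fallback
// buffer is handed to freeAllocationAfterSynchronization() so it is only
// released once the asynchronous kernel launch has completed.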
|
||||
@@ -531,17 +606,6 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
|
||||
Args, Ptrs, *KernelLaunchEnvOrErr);
|
||||
}
|
||||
|
||||
uint32_t NumThreads[3] = {KernelArgs.ThreadLimit[0],
|
||||
KernelArgs.ThreadLimit[1],
|
||||
KernelArgs.ThreadLimit[2]};
|
||||
uint32_t NumBlocks[3] = {KernelArgs.NumTeams[0], KernelArgs.NumTeams[1],
|
||||
KernelArgs.NumTeams[2]};
|
||||
if (!isBareMode()) {
|
||||
NumThreads[0] = getNumThreads(GenericDevice, NumThreads);
|
||||
NumBlocks[0] = getNumBlocks(GenericDevice, NumBlocks, KernelArgs.Tripcount,
|
||||
NumThreads[0], KernelArgs.ThreadLimit[0] > 0);
|
||||
}
|
||||
|
||||
// Record the kernel description after we modified the argument count and num
|
||||
// blocks/threads.
|
||||
RecordReplayTy &RecordReplay = GenericDevice.Plugin.getRecordReplay();
|
||||
@@ -557,8 +621,9 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
|
||||
printLaunchInfo(GenericDevice, KernelArgs, NumThreads, NumBlocks))
|
||||
return Err;
|
||||
|
||||
return launchImpl(GenericDevice, NumThreads, NumBlocks, KernelArgs,
|
||||
LaunchParams, AsyncInfoWrapper);
|
||||
return launchImpl(GenericDevice, NumThreads, NumBlocks,
|
||||
DynBlockMemConf.NativeSize, KernelArgs, LaunchParams,
|
||||
AsyncInfoWrapper);
|
||||
}
|
||||
|
||||
KernelLaunchParamsTy GenericKernelTy::prepareArgs(
|
||||
@@ -1954,6 +2019,16 @@ int32_t GenericPluginTy::query_async(int32_t DeviceId,
|
||||
return OFFLOAD_SUCCESS;
|
||||
}
|
||||
|
||||
InfoTreeNode GenericPluginTy::obtain_device_info(int32_t DeviceId) {
|
||||
auto InfoOrErr = getDevice(DeviceId).obtainInfo();
|
||||
if (auto Err = InfoOrErr.takeError()) {
|
||||
REPORT() << "Failure to obtain device " << DeviceId
|
||||
<< " info: " << toString(std::move(Err));
|
||||
return InfoTreeNode{};
|
||||
}
|
||||
return std::move(*InfoOrErr);
|
||||
}
|
||||
|
||||
void GenericPluginTy::print_device_info(int32_t DeviceId) {
|
||||
if (auto Err = getDevice(DeviceId).printInfo())
|
||||
REPORT() << "Failure to print device " << DeviceId
|
||||
|
||||
@@ -261,6 +261,7 @@ typedef enum CUdevice_attribute_enum {
|
||||
|
||||
typedef enum CUfunction_attribute_enum {
|
||||
CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,
|
||||
CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1,
|
||||
CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8,
|
||||
} CUfunction_attribute;
|
||||
|
||||
|
||||
@@ -150,14 +150,23 @@ struct CUDAKernelTy : public GenericKernelTy {
|
||||
// The maximum number of threads cannot exceed the maximum of the kernel.
|
||||
MaxNumThreads = std::min(MaxNumThreads, (uint32_t)MaxThreads);
|
||||
|
||||
int SharedMemSize;
|
||||
Res = cuFuncGetAttribute(&SharedMemSize,
|
||||
CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, Func);
|
||||
if (auto Err = Plugin::check(Res, "Error in cuFuncGetAttribute: %s"))
|
||||
return Err;
|
||||
|
||||
// Set the static block memory size required by the kernel.
|
||||
StaticBlockMemSize = SharedMemSize;
|
||||
|
||||
// Retrieve the size of the arguments.
|
||||
return initArgsSize();
|
||||
}
|
||||
|
||||
/// Launch the CUDA kernel function.
|
||||
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
|
||||
uint32_t NumBlocks[3], KernelArgsTy &KernelArgs,
|
||||
KernelLaunchParamsTy LaunchParams,
|
||||
uint32_t NumBlocks[3], uint32_t DynBlockMemSize,
|
||||
KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams,
|
||||
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
|
||||
|
||||
/// Return maximum block size for maximum occupancy
|
||||
@@ -197,7 +206,7 @@ private:
|
||||
CUfunction Func;
|
||||
/// The maximum amount of dynamic shared memory per thread group. By default,
|
||||
/// this is set to 48 KB.
|
||||
mutable uint32_t MaxDynCGroupMemLimit = 49152;
|
||||
mutable uint32_t MaxDynBlockMemSize = 49152;
|
||||
|
||||
/// The size of the kernel arguments.
|
||||
size_t ArgsSize;
|
||||
@@ -1411,7 +1420,7 @@ private:
|
||||
KernelArgsTy KernelArgs = {};
|
||||
uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u};
|
||||
if (auto Err = CUDAKernel.launchImpl(
|
||||
*this, NumBlocksAndThreads, NumBlocksAndThreads, KernelArgs,
|
||||
*this, NumBlocksAndThreads, NumBlocksAndThreads, 0, KernelArgs,
|
||||
KernelLaunchParamsTy{}, AsyncInfoWrapper))
|
||||
return Err;
|
||||
|
||||
@@ -1455,6 +1464,7 @@ private:
|
||||
|
||||
Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
|
||||
uint32_t NumThreads[3], uint32_t NumBlocks[3],
|
||||
uint32_t DynBlockMemSize,
|
||||
KernelArgsTy &KernelArgs,
|
||||
KernelLaunchParamsTy LaunchParams,
|
||||
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
|
||||
@@ -1470,9 +1480,6 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
|
||||
if (auto Err = CUDADevice.getStream(AsyncInfoWrapper, Stream))
|
||||
return Err;
|
||||
|
||||
uint32_t MaxDynCGroupMem =
|
||||
std::max(KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize());
|
||||
|
||||
size_t ConfigArgsSize = ArgsSize;
|
||||
void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, LaunchParams.Data,
|
||||
CU_LAUNCH_PARAM_BUFFER_SIZE,
|
||||
@@ -1484,20 +1491,24 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
|
||||
if (GenericDevice.getRPCServer())
|
||||
GenericDevice.Plugin.getRPCServer().Thread->notify();
|
||||
|
||||
// Increase to the requested dynamic memory size for the device if needed.
|
||||
DynBlockMemSize =
|
||||
std::max(DynBlockMemSize, GenericDevice.getDynamicMemorySize());
|
||||
|
||||
// In case we require more memory than the current limit.
|
||||
if (MaxDynCGroupMem >= MaxDynCGroupMemLimit) {
|
||||
if (DynBlockMemSize >= MaxDynBlockMemSize) {
|
||||
CUresult AttrResult = cuFuncSetAttribute(
|
||||
Func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, MaxDynCGroupMem);
|
||||
Func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, DynBlockMemSize);
|
||||
if (auto Err = Plugin::check(
|
||||
AttrResult,
|
||||
"error in cuFuncSetAttribute while setting the memory limits: %s"))
|
||||
return Err;
|
||||
MaxDynCGroupMemLimit = MaxDynCGroupMem;
|
||||
MaxDynBlockMemSize = DynBlockMemSize;
|
||||
}
|
||||
|
||||
CUresult Res = cuLaunchKernel(Func, NumBlocks[0], NumBlocks[1], NumBlocks[2],
|
||||
NumThreads[0], NumThreads[1], NumThreads[2],
|
||||
MaxDynCGroupMem, Stream, nullptr, Config);
|
||||
DynBlockMemSize, Stream, nullptr, Config);
|
||||
|
||||
// Register a callback to indicate when the kernel is complete.
|
||||
if (GenericDevice.getRPCServer())
|
||||
|
||||
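For reference, a standalone sketch of the CUDA driver API pattern used above: opt the kernel into more than the default 48 KiB of dynamic shared memory before launching with that size. The helper name is made up, error handling is reduced to returning the raw CUresult, and the driver API declarations are assumed to be in scope.

static CUresult launchWithDynSmem(CUfunction Func, CUstream Stream,
                                  unsigned Blocks, unsigned Threads,
                                  unsigned DynSmemBytes, void **Config) {
  // Kernels may only use more than the default amount of dynamic shared
  // memory after explicitly opting in via this attribute.
  if (CUresult R = cuFuncSetAttribute(
          Func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, DynSmemBytes))
    return R;
  return cuLaunchKernel(Func, Blocks, 1, 1, Threads, 1, 1, DynSmemBytes,
                        Stream, /*kernelParams=*/nullptr, Config);
}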
@@ -97,8 +97,8 @@ struct GenELF64KernelTy : public GenericKernelTy {
|
||||
|
||||
/// Launch the kernel using the libffi.
|
||||
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
|
||||
uint32_t NumBlocks[3], KernelArgsTy &KernelArgs,
|
||||
KernelLaunchParamsTy LaunchParams,
|
||||
uint32_t NumBlocks[3], uint32_t DynBlockMemSize,
|
||||
KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams,
|
||||
AsyncInfoWrapperTy &AsyncInfoWrapper) const override {
|
||||
if (!SupportsFFI)
|
||||
return Plugin::error(ErrorCode::UNSUPPORTED,
|
||||
|
||||
@@ -124,8 +124,8 @@ public:
|
||||
Error initImpl(GenericDeviceTy &GenericDevice, DeviceImageTy &Image) override;
|
||||
/// Launch the L0 kernel function.
|
||||
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
|
||||
uint32_t NumBlocks[3], KernelArgsTy &KernelArgs,
|
||||
KernelLaunchParamsTy LaunchParams,
|
||||
uint32_t NumBlocks[3], uint32_t DynBlockMemSize,
|
||||
KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams,
|
||||
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
|
||||
Error deinit() {
|
||||
CALL_ZE_RET_ERROR(zeKernelDestroy, zeKernel);
|
||||
|
||||
@@ -413,9 +413,13 @@ Error L0KernelTy::setIndirectFlags(L0DeviceTy &l0Device,
|
||||
|
||||
Error L0KernelTy::launchImpl(GenericDeviceTy &GenericDevice,
|
||||
uint32_t NumThreads[3], uint32_t NumBlocks[3],
|
||||
KernelArgsTy &KernelArgs,
|
||||
uint32_t DynBlockMemSize, KernelArgsTy &KernelArgs,
|
||||
KernelLaunchParamsTy LaunchParams,
|
||||
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
|
||||
if (DynBlockMemSize > 0)
|
||||
return Plugin::error(ErrorCode::UNSUPPORTED,
|
||||
"dynamic shared memory is unsupported in L0 plugin");
|
||||
|
||||
auto &l0Device = L0DeviceTy::makeL0Device(GenericDevice);
|
||||
__tgt_async_info *AsyncInfo = AsyncInfoWrapper;
|
||||
|
||||
|
||||
offload/test/offloading/dyn_groupprivate.cpp (new file, 199 lines)
@@ -0,0 +1,199 @@
|
||||
// RUN: %libomptarget-compilexx-generic -fopenmp-version=61
|
||||
// RUN: %libomptarget-run-generic | %fcheck-generic
|
||||
// RUN: %libomptarget-compileoptxx-generic -fopenmp-version=61
|
||||
// RUN: %libomptarget-run-generic | %fcheck-generic
|
||||
// REQUIRES: gpu
|
||||
// UNSUPPORTED: intelgpu
|
||||
|
||||
#include <omp.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#define N 512
|
||||
|
||||
int main() {
|
||||
int Result[N], NumThreads;
|
||||
|
||||
// Verify the groupprivate buffer works as expected.
|
||||
#pragma omp target teams num_teams(1) thread_limit(N) \
|
||||
dyn_groupprivate(fallback(abort) : N * sizeof(Result[0])) \
|
||||
map(from : Result, NumThreads)
|
||||
{
|
||||
int Buffer[N];
|
||||
#pragma omp parallel
|
||||
{
|
||||
int *DynBuffer = (int *)omp_get_dyn_gprivate_nofb_ptr();
|
||||
int TId = omp_get_thread_num();
|
||||
if (TId == 0)
|
||||
NumThreads = omp_get_num_threads();
|
||||
Buffer[TId] = 7;
|
||||
DynBuffer[TId] = 3;
|
||||
#pragma omp barrier
|
||||
int WrappedTId = (TId + 37) % NumThreads;
|
||||
Result[TId] = Buffer[WrappedTId] + DynBuffer[WrappedTId];
|
||||
}
|
||||
}
|
||||
|
||||
if (NumThreads < N / 2 || NumThreads > N) {
|
||||
printf("Expected number of threads to be in [%i:%i], but got: %i", N / 2, N,
|
||||
NumThreads);
|
||||
return -1;
|
||||
}
|
||||
|
||||
int Failed = 0;
|
||||
for (int i = 0; i < NumThreads; ++i) {
|
||||
if (Result[i] != 7 + 3) {
|
||||
printf("Result[%i] is %i, expected %i\n", i, Result[i], 7 + 3);
|
||||
++Failed;
|
||||
}
|
||||
}
|
||||
|
||||
// Verify that the routines return NULL and zero on the host.
|
||||
if (omp_get_dyn_gprivate_ptr())
|
||||
++Failed;
|
||||
if (omp_get_dyn_gprivate_nofb_ptr())
|
||||
++Failed;
|
||||
if (omp_get_dyn_gprivate_size())
|
||||
++Failed;
|
||||
|
||||
size_t MaxSize = omp_get_gprivate_limit(0, omp_access_cgroup);
|
||||
size_t ExceededSize = MaxSize + 10;
|
||||
|
||||
// Verify that the fallback(default_mem) modifier works.
|
||||
#pragma omp target dyn_groupprivate(fallback(default_mem) : ExceededSize) \
|
||||
map(tofrom : Failed)
|
||||
{
|
||||
if (!omp_get_dyn_gprivate_ptr(0))
|
||||
++Failed;
|
||||
if (omp_get_dyn_gprivate_nofb_ptr(0))
|
||||
++Failed;
|
||||
if (omp_get_dyn_gprivate_ptr(0) == omp_get_dyn_gprivate_nofb_ptr(0))
|
||||
++Failed;
|
||||
if (!omp_get_dyn_gprivate_size())
|
||||
++Failed;
|
||||
if (omp_get_dyn_gprivate_size() != ExceededSize)
|
||||
++Failed;
|
||||
if (omp_get_dyn_gprivate_memspace() != omp_default_mem_space)
|
||||
++Failed;
|
||||
}
|
||||
|
||||
// Verify that the fallback(null) modifier works.
|
||||
#pragma omp target dyn_groupprivate(fallback(null) : ExceededSize) \
|
||||
map(tofrom : Failed)
|
||||
{
|
||||
if (omp_get_dyn_gprivate_ptr(0))
|
||||
++Failed;
|
||||
if (omp_get_dyn_gprivate_nofb_ptr(0))
|
||||
++Failed;
|
||||
if (omp_get_dyn_gprivate_ptr(0) != omp_get_dyn_gprivate_nofb_ptr(0))
|
||||
++Failed;
|
||||
if (omp_get_dyn_gprivate_size())
|
||||
++Failed;
|
||||
if (omp_get_dyn_gprivate_memspace() != omp_null_mem_space)
|
||||
++Failed;
|
||||
}
|
||||
|
||||
// Verify that the default modifier is fallback(default_mem).
|
||||
#pragma omp target dyn_groupprivate(ExceededSize)
|
||||
{
|
||||
if (!omp_get_dyn_gprivate_ptr(0))
|
||||
++Failed;
|
||||
if (omp_get_dyn_gprivate_nofb_ptr(0))
|
||||
++Failed;
|
||||
if (omp_get_dyn_gprivate_ptr(0) == omp_get_dyn_gprivate_nofb_ptr(0))
|
||||
++Failed;
|
||||
if (!omp_get_dyn_gprivate_size())
|
||||
++Failed;
|
||||
if (omp_get_dyn_gprivate_size() != ExceededSize)
|
||||
++Failed;
|
||||
if (omp_get_dyn_gprivate_memspace() != omp_default_mem_space)
|
||||
++Failed;
|
||||
}
|
||||
|
||||
// Verify that the fallback(abort) modifier works.
|
||||
#pragma omp target dyn_groupprivate(fallback(abort) : N) map(tofrom : Failed)
|
||||
{
|
||||
if (!omp_get_dyn_gprivate_ptr(0))
|
||||
++Failed;
|
||||
if (!omp_get_dyn_gprivate_nofb_ptr(0))
|
||||
++Failed;
|
||||
if (omp_get_dyn_gprivate_ptr(0) != omp_get_dyn_gprivate_nofb_ptr(0))
|
||||
++Failed;
|
||||
if (omp_get_dyn_gprivate_ptr(5) != omp_get_dyn_gprivate_nofb_ptr(5))
|
||||
++Failed;
|
||||
if (!omp_get_dyn_gprivate_size())
|
||||
++Failed;
|
||||
if (omp_get_dyn_gprivate_size() != N)
|
||||
++Failed;
|
||||
if (omp_get_dyn_gprivate_memspace() != omp_cgroup_mem_space)
|
||||
++Failed;
|
||||
}
|
||||
|
||||
// Verify that the fallback(default_mem) does not trigger when not needed.
|
||||
#pragma omp target dyn_groupprivate(fallback(default_mem) : N) \
|
||||
map(tofrom : Failed)
|
||||
{
|
||||
if (!omp_get_dyn_gprivate_ptr(0))
|
||||
++Failed;
|
||||
if (!omp_get_dyn_gprivate_nofb_ptr(0))
|
||||
++Failed;
|
||||
if (omp_get_dyn_gprivate_ptr(0) != omp_get_dyn_gprivate_nofb_ptr(0))
|
||||
++Failed;
|
||||
if (!omp_get_dyn_gprivate_size())
|
||||
++Failed;
|
||||
if (omp_get_dyn_gprivate_size() != N)
|
||||
++Failed;
|
||||
if (omp_get_dyn_gprivate_memspace() != omp_cgroup_mem_space)
|
||||
++Failed;
|
||||
}
|
||||
|
||||
// Verify that the clause works when passing a zero size.
|
||||
#pragma omp target dyn_groupprivate(fallback(abort) : 0) map(tofrom : Failed)
|
||||
{
|
||||
if (omp_get_dyn_gprivate_ptr(0))
|
||||
++Failed;
|
||||
if (omp_get_dyn_gprivate_nofb_ptr(0))
|
||||
++Failed;
|
||||
if (omp_get_dyn_gprivate_ptr(0) != omp_get_dyn_gprivate_nofb_ptr(0))
|
||||
++Failed;
|
||||
if (omp_get_dyn_gprivate_size())
|
||||
++Failed;
|
||||
if (omp_get_dyn_gprivate_memspace() != omp_null_mem_space)
|
||||
++Failed;
|
||||
}
|
||||
|
||||
// Verify that the clause works when passing a zero size and
|
||||
// fallback(default_mem).
|
||||
#pragma omp target dyn_groupprivate(fallback(default_mem) : 0) \
|
||||
map(tofrom : Failed)
|
||||
{
|
||||
if (omp_get_dyn_gprivate_ptr(0))
|
||||
++Failed;
|
||||
if (omp_get_dyn_gprivate_nofb_ptr(0))
|
||||
++Failed;
|
||||
if (omp_get_dyn_gprivate_ptr(0) != omp_get_dyn_gprivate_nofb_ptr(0))
|
||||
++Failed;
|
||||
if (omp_get_dyn_gprivate_size())
|
||||
++Failed;
|
||||
if (omp_get_dyn_gprivate_memspace() != omp_null_mem_space)
|
||||
++Failed;
|
||||
}
|
||||
|
||||
// Verify that omitting the clause is the same as setting zero size.
|
||||
#pragma omp target map(tofrom : Failed)
|
||||
{
|
||||
if (omp_get_dyn_gprivate_ptr(0))
|
||||
++Failed;
|
||||
if (omp_get_dyn_gprivate_nofb_ptr(0))
|
||||
++Failed;
|
||||
if (omp_get_dyn_gprivate_ptr(0) != omp_get_dyn_gprivate_nofb_ptr(0))
|
||||
++Failed;
|
||||
if (omp_get_dyn_gprivate_size())
|
||||
++Failed;
|
||||
if (omp_get_dyn_gprivate_memspace() != omp_null_mem_space)
|
||||
++Failed;
|
||||
}
|
||||
|
||||
// CHECK: PASS
|
||||
if (!Failed)
|
||||
printf("PASS\n");
|
||||
}
|
||||
@@ -171,9 +171,29 @@ typedef enum omp_allocator_handle_t {
|
||||
KMP_ALLOCATOR_MAX_HANDLE = ~(0LU)
|
||||
} omp_allocator_handle_t;
|
||||
|
||||
typedef enum omp_memspace_handle_t {
|
||||
omp_null_mem_space = 0,
|
||||
omp_default_mem_space = 99,
|
||||
omp_large_cap_mem_space = 1,
|
||||
omp_const_mem_space = 2,
|
||||
omp_high_bw_mem_space = 3,
|
||||
omp_low_lat_mem_space = 4,
|
||||
omp_cgroup_mem_space = 5,
|
||||
KMP_MEMSPACE_MAX_HANDLE = ~(0LU)
|
||||
} omp_memspace_handle_t;
|
||||
|
||||
#define __PRAGMA(STR) _Pragma(#STR)
|
||||
#define OMP_PRAGMA(STR) __PRAGMA(omp STR)
|
||||
|
||||
///}
|
||||
|
||||
/// The OpenMP access group type. The criterion for grouping tasks using a
/// specific grouping property.
|
||||
enum omp_access_t {
|
||||
/// Groups the tasks based on the contention group to which they belong.
|
||||
omp_access_cgroup = 0,
|
||||
/// Groups the tasks based on the parallel region to which they bind.
|
||||
omp_access_pteam = 1,
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
@@ -226,7 +226,7 @@ struct KernelEnvironmentTy;
|
||||
int8_t __kmpc_is_spmd_exec_mode();
|
||||
|
||||
int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
|
||||
KernelLaunchEnvironmentTy &KernelLaunchEnvironment);
|
||||
KernelLaunchEnvironmentTy *KernelLaunchEnvironment);
|
||||
|
||||
void __kmpc_target_deinit();
|
||||
|
||||
|
||||
@@ -116,7 +116,7 @@ extern Local<ThreadStateTy **> ThreadStates;
|
||||
|
||||
/// Initialize the state machinery. Must be called by all threads.
|
||||
void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
|
||||
KernelLaunchEnvironmentTy &KernelLaunchEnvironment);
|
||||
KernelLaunchEnvironmentTy *KernelLaunchEnvironment);
|
||||
|
||||
/// Return the kernel and kernel launch environment associated with the current
|
||||
/// kernel. The former is static and contains compile time information that
|
||||
|
||||
@@ -35,8 +35,8 @@ enum OMPTgtExecModeFlags : unsigned char {
|
||||
};
|
||||
|
||||
static void
|
||||
inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
|
||||
KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
|
||||
initializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
|
||||
KernelLaunchEnvironmentTy *KernelLaunchEnvironment) {
|
||||
// Order is important here.
|
||||
synchronize::init(IsSPMD);
|
||||
mapping::init(IsSPMD);
|
||||
@@ -80,16 +80,16 @@ extern "C" {
|
||||
/// \param Ident Source location identification, can be NULL.
|
||||
///
|
||||
int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
|
||||
KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
|
||||
KernelLaunchEnvironmentTy *KernelLaunchEnvironment) {
|
||||
ConfigurationEnvironmentTy &Configuration = KernelEnvironment.Configuration;
|
||||
bool IsSPMD = Configuration.ExecMode & OMP_TGT_EXEC_MODE_SPMD;
|
||||
bool UseGenericStateMachine = Configuration.UseGenericStateMachine;
|
||||
if (IsSPMD) {
|
||||
inititializeRuntime(/*IsSPMD=*/true, KernelEnvironment,
|
||||
initializeRuntime(/*IsSPMD=*/true, KernelEnvironment,
|
||||
KernelLaunchEnvironment);
|
||||
synchronize::threadsAligned(atomic::relaxed);
|
||||
} else {
|
||||
inititializeRuntime(/*IsSPMD=*/false, KernelEnvironment,
|
||||
initializeRuntime(/*IsSPMD=*/false, KernelEnvironment,
|
||||
KernelLaunchEnvironment);
|
||||
// No need to wait since only the main threads will execute user
|
||||
// code and workers will run into a barrier right away.
|
||||
|
||||
@@ -40,6 +40,10 @@ using namespace ompx;
|
||||
[[clang::loader_uninitialized]] static Local<KernelLaunchEnvironmentTy *>
|
||||
KernelLaunchEnvironmentPtr;
|
||||
|
||||
/// The pointer type for dynamic shared memory. This is important to keep
|
||||
/// the alignment and address space information.
|
||||
using SharedMemPtrTy = decltype(&DynamicSharedBuffer[0]);
|
||||
|
||||
///}
|
||||
|
||||
namespace {
|
||||
@@ -138,6 +142,60 @@ void SharedMemorySmartStackTy::pop(void *Ptr, uint64_t Bytes) {
|
||||
memory::freeGlobal(Ptr, "Slow path shared memory deallocation");
|
||||
}
|
||||
|
||||
/// Manager of the dynamic cgroup memory buffer.
|
||||
struct DynCGroupMemTy {
|
||||
  /// Initialize the manager with the information from the kernel launch
  /// environment and the pointer to the native shared memory buffer.
|
||||
void init(KernelLaunchEnvironmentTy *KLE, SharedMemPtrTy NativePtr) {
|
||||
// Initialize default values.
|
||||
NativeOrNullPtr = nullptr;
|
||||
FallbackPtr = nullptr;
|
||||
Size = 0;
|
||||
Fallback = DynCGroupMemFallbackType::None;
|
||||
if (!KLE)
|
||||
return;
|
||||
|
||||
// Initialize values using the kernel launch environment.
|
||||
Size = KLE->DynCGroupMemSize;
|
||||
Fallback = KLE->DynCGroupMemFb;
|
||||
if (Size && Fallback == DynCGroupMemFallbackType::None)
|
||||
NativeOrNullPtr = NativePtr;
|
||||
if (Fallback == DynCGroupMemFallbackType::DefaultMem)
|
||||
FallbackPtr = static_cast<unsigned char *>(KLE->DynCGroupMemFbPtr) +
|
||||
Size * mapping::getBlockIdInKernel();
|
||||
}
|
||||
|
||||
/// Get the memory space of the buffer.
|
||||
omp_memspace_handle_t getMemSpace() const {
|
||||
if (Size == 0)
|
||||
return omp_null_mem_space;
|
||||
if (Fallback == DynCGroupMemFallbackType::None)
|
||||
return omp_cgroup_mem_space;
|
||||
return omp_default_mem_space;
|
||||
}
|
||||
|
||||
/// Get the size of the buffer.
|
||||
size_t getSize() const { return Size; }
|
||||
|
||||
/// Get the native pointer or null if it was a fallback.
|
||||
SharedMemPtrTy getNativeOrNullPtr() const { return NativeOrNullPtr; }
|
||||
|
||||
/// Get the native pointer or the fallback pointer.
|
||||
unsigned char *getNativeOrFallbackPtr() const {
|
||||
return (Fallback == DynCGroupMemFallbackType::DefaultMem)
|
||||
? FallbackPtr
|
||||
: getNativeOrNullPtr();
|
||||
}
|
||||
|
||||
private:
|
||||
SharedMemPtrTy NativeOrNullPtr;
|
||||
unsigned char *FallbackPtr;
|
||||
size_t Size;
|
||||
DynCGroupMemFallbackType Fallback;
|
||||
};
|
||||
|
||||
[[clang::loader_uninitialized]] static Local<DynCGroupMemTy> DynCGroupMem;
|
||||
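A worked example of the addressing scheme implemented by init() above, with made-up numbers:

// Assume the host granted the DefaultMem fallback with DynCGroupMemSize = 4096
// for a grid of 8 blocks: the plugin allocated 8 * 4096 bytes of global memory
// and put its base address in KLE->DynCGroupMemFbPtr.  Block 3 then computes
//   FallbackPtr = (unsigned char *)KLE->DynCGroupMemFbPtr + 4096 * 3;
// so omp_get_dyn_gprivate_ptr(Offset) yields FallbackPtr + Offset, while
// omp_get_dyn_gprivate_nofb_ptr() still sees a null native pointer because no
// native shared memory was provided for this launch.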
|
||||
} // namespace
|
||||
|
||||
void *memory::getDynamicBuffer() { return DynamicSharedBuffer; }
|
||||
@@ -226,13 +284,18 @@ int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
|
||||
} // namespace
|
||||
|
||||
void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
|
||||
KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
|
||||
KernelLaunchEnvironmentTy *KLE) {
|
||||
SharedMemorySmartStack.init(IsSPMD);
|
||||
|
||||
if (KLE == reinterpret_cast<KernelLaunchEnvironmentTy *>(~0))
|
||||
KLE = nullptr;
|
||||
|
||||
if (mapping::isInitialThreadInLevel0(IsSPMD)) {
|
||||
DynCGroupMem.init(KLE, DynamicSharedBuffer);
|
||||
TeamState.init(IsSPMD);
|
||||
ThreadStates = nullptr;
|
||||
KernelEnvironmentPtr = &KernelEnvironment;
|
||||
KernelLaunchEnvironmentPtr = &KernelLaunchEnvironment;
|
||||
KernelLaunchEnvironmentPtr = KLE;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -416,6 +479,25 @@ int omp_get_team_num() { return mapping::getBlockIdInKernel(); }
|
||||
int omp_get_initial_device(void) { return -1; }
|
||||
|
||||
int omp_is_initial_device(void) { return 0; }
|
||||
|
||||
void *omp_get_dyn_gprivate_ptr(size_t Offset, omp_access_t) {
|
||||
return DynCGroupMem.getNativeOrFallbackPtr() + Offset;
|
||||
}
|
||||
|
||||
void *omp_get_dyn_gprivate_nofb_ptr(size_t Offset, omp_access_t) {
|
||||
unsigned char *Ptr = DynCGroupMem.getNativeOrNullPtr();
|
||||
// Ensure the alignment and address space information is kept.
|
||||
Ptr = (unsigned char *)__builtin_assume_aligned(Ptr, allocator::ALIGNMENT);
|
||||
return (SharedMemPtrTy)(Ptr + Offset);
|
||||
}
|
||||
|
||||
size_t omp_get_dyn_gprivate_size(omp_access_t) {
|
||||
return DynCGroupMem.getSize();
|
||||
}
|
||||
|
||||
omp_memspace_handle_t omp_get_dyn_gprivate_memspace(omp_access_t) {
|
||||
return DynCGroupMem.getMemSpace();
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" {
|
||||
|
||||
@@ -607,6 +607,7 @@ kmp_set_disp_num_buffers 890
|
||||
llvm_omp_target_shared_mem_space DATA
|
||||
llvm_omp_target_device_mem_space DATA
|
||||
omp_null_mem_space DATA
|
||||
omp_cgroup_mem_space DATA
|
||||
|
||||
%ifndef stub
|
||||
# Ordinals between 900 and 999 are reserved
|
||||
|
||||
@@ -380,6 +380,11 @@
|
||||
omp_uintptr_t value;
|
||||
} omp_alloctrait_t;
|
||||
|
||||
typedef enum {
|
||||
omp_access_cgroup = 0,
|
||||
omp_access_pteam = 1
|
||||
} omp_access_t;
|
||||
|
||||
# if defined(_WIN32)
|
||||
// On Windows cl and icl do not support 64-bit enum, let's use integer then.
|
||||
typedef omp_uintptr_t omp_allocator_handle_t;
|
||||
@@ -403,6 +408,7 @@
|
||||
extern __KMP_IMP omp_memspace_handle_t const omp_const_mem_space;
|
||||
extern __KMP_IMP omp_memspace_handle_t const omp_high_bw_mem_space;
|
||||
extern __KMP_IMP omp_memspace_handle_t const omp_low_lat_mem_space;
|
||||
extern __KMP_IMP omp_memspace_handle_t const omp_cgroup_mem_space;
|
||||
extern __KMP_IMP omp_memspace_handle_t const llvm_omp_target_host_mem_space;
|
||||
extern __KMP_IMP omp_memspace_handle_t const llvm_omp_target_shared_mem_space;
|
||||
extern __KMP_IMP omp_memspace_handle_t const llvm_omp_target_device_mem_space;
|
||||
@@ -439,6 +445,7 @@
|
||||
omp_const_mem_space = 2,
|
||||
omp_high_bw_mem_space = 3,
|
||||
omp_low_lat_mem_space = 4,
|
||||
omp_cgroup_mem_space = 5,
|
||||
llvm_omp_target_host_mem_space = 100,
|
||||
llvm_omp_target_shared_mem_space = 101,
|
||||
llvm_omp_target_device_mem_space = 102,
|
||||
@@ -463,6 +470,11 @@
|
||||
omp_allocator_handle_t allocator = omp_null_allocator,
|
||||
omp_allocator_handle_t free_allocator = omp_null_allocator);
|
||||
extern void __KAI_KMPC_CONVENTION omp_free(void * ptr, omp_allocator_handle_t a = omp_null_allocator);
|
||||
extern void *__KAI_KMPC_CONVENTION omp_get_dyn_gprivate_ptr(size_t offset = 0, omp_access_t access_group = omp_access_cgroup);
|
||||
extern void *__KAI_KMPC_CONVENTION omp_get_dyn_gprivate_nofb_ptr(size_t offset = 0, omp_access_t access_group = omp_access_cgroup);
|
||||
extern size_t __KAI_KMPC_CONVENTION omp_get_dyn_gprivate_size(omp_access_t access_group = omp_access_cgroup);
|
||||
extern omp_memspace_handle_t __KAI_KMPC_CONVENTION omp_get_dyn_gprivate_memspace(omp_access_t access_group = omp_access_cgroup);
|
||||
extern size_t __KAI_KMPC_CONVENTION omp_get_gprivate_limit(int device_num, omp_access_t access_group = omp_access_cgroup);
|
||||
# else
|
||||
extern void *__KAI_KMPC_CONVENTION omp_alloc(size_t size, omp_allocator_handle_t a);
|
||||
extern void *__KAI_KMPC_CONVENTION omp_aligned_alloc(size_t align, size_t size,
|
||||
@@ -473,6 +485,11 @@
|
||||
extern void *__KAI_KMPC_CONVENTION omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator,
|
||||
omp_allocator_handle_t free_allocator);
|
||||
extern void __KAI_KMPC_CONVENTION omp_free(void *ptr, omp_allocator_handle_t a);
|
||||
extern void *__KAI_KMPC_CONVENTION omp_get_dyn_gprivate_ptr(size_t offset, omp_access_t access_group);
|
||||
extern void *__KAI_KMPC_CONVENTION omp_get_dyn_gprivate_nofb_ptr(size_t offset, omp_access_t access_group);
|
||||
extern size_t __KAI_KMPC_CONVENTION omp_get_dyn_gprivate_size(omp_access_t access_group);
|
||||
extern omp_memspace_handle_t __KAI_KMPC_CONVENTION omp_get_dyn_gprivate_memspace(omp_access_t access_group);
|
||||
extern size_t __KAI_KMPC_CONVENTION omp_get_gprivate_limit(int device_num, omp_access_t access_group);
|
||||
# endif
|
||||
|
||||
/* OpenMP TR11 routines to get memory spaces and allocators */
|
||||
|
||||
@@ -1072,6 +1072,7 @@ extern omp_memspace_handle_t const omp_large_cap_mem_space;
|
||||
extern omp_memspace_handle_t const omp_const_mem_space;
|
||||
extern omp_memspace_handle_t const omp_high_bw_mem_space;
|
||||
extern omp_memspace_handle_t const omp_low_lat_mem_space;
|
||||
extern omp_memspace_handle_t const omp_cgroup_mem_space;
|
||||
extern omp_memspace_handle_t const llvm_omp_target_host_mem_space;
|
||||
extern omp_memspace_handle_t const llvm_omp_target_shared_mem_space;
|
||||
extern omp_memspace_handle_t const llvm_omp_target_device_mem_space;
|
||||
|
||||
@@ -4515,6 +4515,20 @@ void omp_free(void *ptr, omp_allocator_handle_t allocator) {
|
||||
}
|
||||
/* end of OpenMP 5.1 Memory Management routines */
|
||||
|
||||
void *omp_get_dyn_gprivate_ptr(size_t offset, omp_access_t access_group) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void *omp_get_dyn_gprivate_nofb_ptr(size_t offset, omp_access_t access_group) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
size_t omp_get_dyn_gprivate_size(omp_access_t access_group) { return 0; }
|
||||
|
||||
omp_memspace_handle_t omp_get_dyn_gprivate_memspace(omp_access_t access_group) {
|
||||
return omp_null_mem_space;
|
||||
}
|
||||
|
||||
int __kmpc_get_target_offload(void) {
|
||||
if (!__kmp_init_serial) {
|
||||
__kmp_serial_initialize();
|
||||
|
||||
@@ -333,6 +333,8 @@ omp_memspace_handle_t const omp_high_bw_mem_space =
|
||||
(omp_memspace_handle_t const)3;
|
||||
omp_memspace_handle_t const omp_low_lat_mem_space =
|
||||
(omp_memspace_handle_t const)4;
|
||||
omp_memspace_handle_t const omp_cgroup_mem_space =
|
||||
(omp_memspace_handle_t const)5;
|
||||
omp_memspace_handle_t const llvm_omp_target_host_mem_space =
|
||||
(omp_memspace_handle_t const)100;
|
||||
omp_memspace_handle_t const llvm_omp_target_shared_mem_space =
|
||||
|
||||
@@ -368,6 +368,8 @@ omp_memspace_handle_t const omp_high_bw_mem_space =
|
||||
(omp_memspace_handle_t const)3;
|
||||
omp_memspace_handle_t const omp_low_lat_mem_space =
|
||||
(omp_memspace_handle_t const)4;
|
||||
omp_memspace_handle_t const omp_cgroup_mem_space =
|
||||
(omp_memspace_handle_t const)5;
|
||||
omp_memspace_handle_t const llvm_omp_target_host_mem_space =
|
||||
(omp_memspace_handle_t const)100;
|
||||
omp_memspace_handle_t const llvm_omp_target_shared_mem_space =
|
||||
@@ -454,6 +456,31 @@ void omp_free(void *ptr, omp_allocator_handle_t allocator) {
|
||||
#endif
|
||||
}
|
||||
|
||||
void *omp_get_dyn_gprivate_ptr(size_t offset, omp_access_t access_group) {
|
||||
i;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void *omp_get_dyn_gprivate_nofb_ptr(size_t offset, omp_access_t access_group) {
|
||||
i;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
size_t omp_get_dyn_gprivate_size(omp_access_t access_group) {
|
||||
i;
|
||||
return 0;
|
||||
}
|
||||
|
||||
omp_memspace_handle_t omp_get_dyn_gprivate_memspace(omp_access_t access_group) {
|
||||
i;
|
||||
return omp_null_mem_space;
|
||||
}
|
||||
|
||||
size_t omp_get_gprivate_limit(int device_num, omp_access_t access_group) {
|
||||
i;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* OpenMP 5.0 Affinity Format */
|
||||
void omp_set_affinity_format(char const *format) { i; }
|
||||
size_t omp_get_affinity_format(char *buffer, size_t size) {
|
||||
|
||||