[OpenMP][Offload] Add offload runtime support for dyn_groupprivate clause (#152831)

Part 3: adds the offload runtime support. See
https://github.com/llvm/llvm-project/pull/152651.

---------

Co-authored-by: Krzysztof Parzyszek <Krzysztof.Parzyszek@amd.com>
This commit is contained in:
Kevin Sala Penades
2026-03-12 01:13:06 -07:00
committed by GitHub
parent 11e0d6ae4b
commit 1f583c6dee
27 changed files with 613 additions and 65 deletions

View File

@@ -101,7 +101,7 @@ __OMP_STRUCT_TYPE(DynamicEnvironment, DynamicEnvironmentTy, false, Int16)
__OMP_STRUCT_TYPE(KernelEnvironment, KernelEnvironmentTy, false, __OMP_STRUCT_TYPE(KernelEnvironment, KernelEnvironmentTy, false,
ConfigurationEnvironment, IdentPtr, DynamicEnvironmentPtr) ConfigurationEnvironment, IdentPtr, DynamicEnvironmentPtr)
__OMP_STRUCT_TYPE(KernelLaunchEnvironment, KernelLaunchEnvironmentTy, false, __OMP_STRUCT_TYPE(KernelLaunchEnvironment, KernelLaunchEnvironmentTy, false,
Int32, Int32) VoidPtr, VoidPtr, Int32, Int32, Int32, Int8)
#undef __OMP_STRUCT_TYPE #undef __OMP_STRUCT_TYPE
#undef OMP_STRUCT_TYPE #undef OMP_STRUCT_TYPE

View File

@@ -102,8 +102,9 @@ struct KernelArgsTy {
struct { struct {
uint64_t NoWait : 1; // Was this kernel spawned with a `nowait` clause. uint64_t NoWait : 1; // Was this kernel spawned with a `nowait` clause.
uint64_t IsCUDA : 1; // Was this kernel spawned via CUDA. uint64_t IsCUDA : 1; // Was this kernel spawned via CUDA.
uint64_t Unused : 62; uint64_t DynCGroupMemFallback : 2; // The fallback for dynamic cgroup mem.
} Flags = {0, 0, 0}; uint64_t Unused : 60;
} Flags = {0, 0, 0, 0};
// The number of teams (for x,y,z dimension). // The number of teams (for x,y,z dimension).
uint32_t NumTeams[3] = {0, 0, 0}; uint32_t NumTeams[3] = {0, 0, 0};
// The number of threads (for x,y,z dimension). // The number of threads (for x,y,z dimension).

View File

@@ -70,10 +70,25 @@ struct KernelEnvironmentTy {
DynamicEnvironmentTy *DynamicEnv = nullptr; DynamicEnvironmentTy *DynamicEnv = nullptr;
}; };
/// The fallback types for the dynamic cgroup (contention group) memory
/// requested for a kernel launch.
enum class DynCGroupMemFallbackType : uint8_t {
/// None. Indicates that no fallback was triggered for this launch.
None = 0,
/// Abort the execution when the requested memory cannot be provided.
/// Shares the value 0 with None: a launch that aborts never reports a
/// triggered fallback, so the two states never coexist.
Abort = None,
/// Return a null pointer instead of a dynamic memory buffer.
Null = 1,
/// Allocate from an implementation-defined memory space (e.g., global
/// device memory) when the native block memory is insufficient.
DefaultMem = 2
};
struct KernelLaunchEnvironmentTy { struct KernelLaunchEnvironmentTy {
void *ReductionBuffer = nullptr;
void *DynCGroupMemFbPtr = nullptr;
uint32_t ReductionCnt = 0; uint32_t ReductionCnt = 0;
uint32_t ReductionIterCnt = 0; uint32_t ReductionIterCnt = 0;
void *ReductionBuffer = nullptr; uint32_t DynCGroupMemSize = 0;
DynCGroupMemFallbackType DynCGroupMemFb = DynCGroupMemFallbackType::None;
}; };
#endif // OMPTARGET_SHARED_ENVIRONMENT_H #endif // OMPTARGET_SHARED_ENVIRONMENT_H

View File

@@ -37,6 +37,8 @@
#include "PluginInterface.h" #include "PluginInterface.h"
using GenericPluginTy = llvm::omp::target::plugin::GenericPluginTy; using GenericPluginTy = llvm::omp::target::plugin::GenericPluginTy;
using DeviceInfo = llvm::omp::target::plugin::DeviceInfo;
using InfoTreeNode = llvm::omp::target::plugin::InfoTreeNode;
// Forward declarations. // Forward declarations.
struct __tgt_bin_desc; struct __tgt_bin_desc;
@@ -167,6 +169,20 @@ struct DeviceTy {
/// Indicate that there are pending images for this device or not. /// Indicate that there are pending images for this device or not.
void setHasPendingImages(bool V) { HasPendingImages = V; } void setHasPendingImages(bool V) { HasPendingImages = V; }
/// Get a single piece of information from the device.
///
/// Queries the plugin for the device's info tree and extracts the entry
/// identified by \p Info as type \p T. Returns a value-initialized \c T{}
/// when the entry is missing or holds a different alternative type, so the
/// result is well-defined for any \p T (the previous `return 0` only worked
/// for numeric types).
template <typename T> T getInfo(DeviceInfo Info) const {
InfoTreeNode DevInfo = RTL->obtain_device_info(RTLDeviceID);
auto EntryOpt = DevInfo.get(Info);
if (!EntryOpt)
return T{};
auto Entry = *EntryOpt;
if (!std::holds_alternative<T>(Entry->Value))
return T{};
return std::get<T>(Entry->Value);
}
private: private:
/// Deinitialize the device (and plugin). /// Deinitialize the device (and plugin).
void deinit(); void deinit();

View File

@@ -274,12 +274,23 @@ struct __tgt_target_non_contig {
extern "C" { extern "C" {
#endif #endif
/// The OpenMP access group type. The criterion for grouping tasks using a
/// specific grouping property. Used by, e.g., omp_get_gprivate_limit() to
/// select the grouping scope to query.
enum omp_access_t {
/// Groups the tasks based on the contention group to which they belong.
omp_access_cgroup = 0,
/// Groups the tasks based on the parallel region to which they bind.
omp_access_pteam = 1,
};
void ompx_dump_mapping_tables(void); void ompx_dump_mapping_tables(void);
int omp_get_num_devices(void); int omp_get_num_devices(void);
int omp_get_device_num(void); int omp_get_device_num(void);
int omp_get_device_from_uid(const char *DeviceUid); int omp_get_device_from_uid(const char *DeviceUid);
const char *omp_get_uid_from_device(int DeviceNum); const char *omp_get_uid_from_device(int DeviceNum);
int omp_get_initial_device(void); int omp_get_initial_device(void);
size_t omp_get_gprivate_limit(int DeviceNum,
omp_access_t AccessGroup = omp_access_cgroup);
void *omp_target_alloc(size_t Size, int DeviceNum); void *omp_target_alloc(size_t Size, int DeviceNum);
void omp_target_free(void *DevicePtr, int DeviceNum); void omp_target_free(void *DevicePtr, int DeviceNum);
int omp_target_is_present(const void *Ptr, int DeviceNum); int omp_target_is_present(const void *Ptr, int DeviceNum);

View File

@@ -138,6 +138,22 @@ EXTERN int omp_get_initial_device(void) {
return HostDevice; return HostDevice;
} }
EXTERN size_t omp_get_gprivate_limit(int DeviceNum, omp_access_t AccessGroup) {
TIMESCOPE();
OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
// The host device exposes no groupprivate memory, and only the contention
// group scope is currently supported; report zero in both cases.
if (DeviceNum == omp_get_initial_device() || AccessGroup != omp_access_cgroup)
return 0;
auto Device = PM->getDevice(DeviceNum);
if (!Device)
FATAL_MESSAGE(DeviceNum, "%s", toString(Device.takeError()).c_str());
// Ask the device for its per-workgroup local memory capacity.
return Device->getInfo<uint64_t>(DeviceInfo::WORK_GROUP_LOCAL_MEM_SIZE);
}
EXTERN void *omp_target_alloc(size_t Size, int DeviceNum) { EXTERN void *omp_target_alloc(size_t Size, int DeviceNum) {
TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DeviceNum) + TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DeviceNum) +
";size=" + std::to_string(Size)); ";size=" + std::to_string(Size));

View File

@@ -43,6 +43,7 @@ VERS1.0 {
omp_get_device_from_uid; omp_get_device_from_uid;
omp_get_uid_from_device; omp_get_uid_from_device;
omp_get_initial_device; omp_get_initial_device;
omp_get_gprivate_limit;
omp_target_alloc; omp_target_alloc;
omp_target_free; omp_target_free;
omp_target_is_accessible; omp_target_is_accessible;

View File

@@ -559,6 +559,9 @@ struct AMDGPUKernelTy : public GenericKernelTy {
return Err; return Err;
} }
// Set the static block memory size required by the kernel.
StaticBlockMemSize = GroupSize;
// Make sure it is a kernel symbol. // Make sure it is a kernel symbol.
if (SymbolType != HSA_SYMBOL_KIND_KERNEL) if (SymbolType != HSA_SYMBOL_KIND_KERNEL)
return Plugin::error(ErrorCode::INVALID_BINARY, return Plugin::error(ErrorCode::INVALID_BINARY,
@@ -582,8 +585,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
/// Launch the AMDGPU kernel function. /// Launch the AMDGPU kernel function.
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3], Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
uint32_t NumBlocks[3], KernelArgsTy &KernelArgs, uint32_t NumBlocks[3], uint32_t DynBlockMemSize,
KernelLaunchParamsTy LaunchParams, KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override; AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
/// Return maximum block size for maximum occupancy /// Return maximum block size for maximum occupancy
@@ -3220,7 +3223,7 @@ private:
KernelArgsTy KernelArgs = {}; KernelArgsTy KernelArgs = {};
uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u}; uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u};
if (auto Err = AMDGPUKernel.launchImpl( if (auto Err = AMDGPUKernel.launchImpl(
*this, NumBlocksAndThreads, NumBlocksAndThreads, KernelArgs, *this, NumBlocksAndThreads, NumBlocksAndThreads, 0, KernelArgs,
KernelLaunchParamsTy{}, AsyncInfoWrapper)) KernelLaunchParamsTy{}, AsyncInfoWrapper))
return Err; return Err;
@@ -3755,6 +3758,7 @@ private:
Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice, Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
uint32_t NumThreads[3], uint32_t NumBlocks[3], uint32_t NumThreads[3], uint32_t NumBlocks[3],
uint32_t DynBlockMemSize,
KernelArgsTy &KernelArgs, KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams, KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const { AsyncInfoWrapperTy &AsyncInfoWrapper) const {
@@ -3767,13 +3771,6 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
if (auto Err = ArgsMemoryManager.allocate(ArgsSize, &AllArgs)) if (auto Err = ArgsMemoryManager.allocate(ArgsSize, &AllArgs))
return Err; return Err;
// Account for user requested dynamic shared memory.
uint32_t GroupSize = getGroupSize();
if (uint32_t MaxDynCGroupMem = std::max(
KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize())) {
GroupSize += MaxDynCGroupMem;
}
uint64_t StackSize; uint64_t StackSize;
if (auto Err = GenericDevice.getDeviceStackSize(StackSize)) if (auto Err = GenericDevice.getDeviceStackSize(StackSize))
return Err; return Err;
@@ -3825,9 +3822,17 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
KernelArgs.DynCGroupMem); KernelArgs.DynCGroupMem);
} }
// Increase to the requested dynamic memory size for the device if needed.
DynBlockMemSize =
std::max(DynBlockMemSize, GenericDevice.getDynamicMemorySize());
// HSA requires the group segment size to include both static and dynamic.
uint32_t TotalBlockMemSize = getStaticBlockMemSize() + DynBlockMemSize;
// Push the kernel launch into the stream. // Push the kernel launch into the stream.
return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks, return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
GroupSize, StackSize, ArgsMemoryManager); TotalBlockMemSize, StackSize,
ArgsMemoryManager);
} }
Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice, Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,

View File

@@ -301,6 +301,18 @@ private:
} }
}; };
/// Configuration of dynamic block memory needed for launching a kernel.
struct DynBlockMemConfTy {
/// The size of the dynamic block memory buffer visible to the kernel. May
/// differ from NativeSize when a fallback was triggered.
uint32_t Size = 0;
/// The size of dynamic shared memory natively provided by the device.
uint32_t NativeSize = 0;
/// The fallback that was triggered (if any).
DynCGroupMemFallbackType Fallback = DynCGroupMemFallbackType::None;
/// The fallback pointer if global memory was used as alternative.
void *FallbackPtr = nullptr;
};
/// Class wrapping a __tgt_device_image and its offload entry table on a /// Class wrapping a __tgt_device_image and its offload entry table on a
/// specific device. This class is responsible for storing and managing /// specific device. This class is responsible for storing and managing
/// the offload entries for an image on a device. /// the offload entries for an image on a device.
@@ -363,7 +375,7 @@ struct GenericKernelTy {
AsyncInfoWrapperTy &AsyncInfoWrapper) const; AsyncInfoWrapperTy &AsyncInfoWrapper) const;
virtual Error launchImpl(GenericDeviceTy &GenericDevice, virtual Error launchImpl(GenericDeviceTy &GenericDevice,
uint32_t NumThreads[3], uint32_t NumBlocks[3], uint32_t NumThreads[3], uint32_t NumBlocks[3],
KernelArgsTy &KernelArgs, uint32_t DynBlockMemSize, KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams, KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0; AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0;
@@ -373,6 +385,9 @@ struct GenericKernelTy {
/// Get the kernel name. /// Get the kernel name.
const char *getName() const { return Name.c_str(); } const char *getName() const { return Name.c_str(); }
/// Get the size of the static per-block memory consumed by the kernel.
uint32_t getStaticBlockMemSize() const { return StaticBlockMemSize; };
/// Get the kernel image. /// Get the kernel image.
DeviceImageTy &getImage() const { DeviceImageTy &getImage() const {
assert(ImagePtr && "Kernel is not initialized!"); assert(ImagePtr && "Kernel is not initialized!");
@@ -386,8 +401,10 @@ struct GenericKernelTy {
/// Return a device pointer to a new kernel launch environment. /// Return a device pointer to a new kernel launch environment.
Expected<KernelLaunchEnvironmentTy *> Expected<KernelLaunchEnvironmentTy *>
getKernelLaunchEnvironment(GenericDeviceTy &GenericDevice, uint32_t Version, getKernelLaunchEnvironment(GenericDeviceTy &GenericDevice,
AsyncInfoWrapperTy &AsyncInfo) const; const KernelArgsTy &KernelArgs,
const DynBlockMemConfTy &DynBlockMemConf,
AsyncInfoWrapperTy &AsyncInfoWrapper) const;
/// Indicate whether an execution mode is valid. /// Indicate whether an execution mode is valid.
static bool isValidExecutionMode(OMPTgtExecModeFlags ExecutionMode) { static bool isValidExecutionMode(OMPTgtExecModeFlags ExecutionMode) {
@@ -433,6 +450,12 @@ protected:
uint32_t NumBlocks[3]) const; uint32_t NumBlocks[3]) const;
private: private:
/// Prepare the block memory buffer requested for the kernel and execute the
/// specified fallback if necessary.
Expected<DynBlockMemConfTy> prepareBlockMemory(GenericDeviceTy &GenericDevice,
KernelArgsTy &KernelArgs,
uint32_t NumBlocks) const;
/// Prepare the arguments before launching the kernel. /// Prepare the arguments before launching the kernel.
KernelLaunchParamsTy KernelLaunchParamsTy
prepareArgs(GenericDeviceTy &GenericDevice, void **ArgPtrs, prepareArgs(GenericDeviceTy &GenericDevice, void **ArgPtrs,
@@ -487,6 +510,9 @@ protected:
/// The maximum number of threads which the kernel could leverage. /// The maximum number of threads which the kernel could leverage.
uint32_t MaxNumThreads; uint32_t MaxNumThreads;
/// The static memory sized per block.
uint32_t StaticBlockMemSize = 0;
/// The kernel environment, including execution flags. /// The kernel environment, including execution flags.
KernelEnvironmentTy KernelEnvironment; KernelEnvironmentTy KernelEnvironment;
@@ -1498,6 +1524,9 @@ public:
/// Query the current state of an asynchronous queue. /// Query the current state of an asynchronous queue.
int32_t query_async(int32_t DeviceId, __tgt_async_info *AsyncInfoPtr); int32_t query_async(int32_t DeviceId, __tgt_async_info *AsyncInfoPtr);
/// Obtain information about the given device.
InfoTreeNode obtain_device_info(int32_t DeviceId);
/// Prints information about the given devices supported by the plugin. /// Prints information about the given devices supported by the plugin.
void print_device_info(int32_t DeviceId); void print_device_info(int32_t DeviceId);

View File

@@ -435,20 +435,21 @@ Error GenericKernelTy::init(GenericDeviceTy &GenericDevice,
Expected<KernelLaunchEnvironmentTy *> Expected<KernelLaunchEnvironmentTy *>
GenericKernelTy::getKernelLaunchEnvironment( GenericKernelTy::getKernelLaunchEnvironment(
GenericDeviceTy &GenericDevice, uint32_t Version, GenericDeviceTy &GenericDevice, const KernelArgsTy &KernelArgs,
const DynBlockMemConfTy &DynBlockMemConf,
AsyncInfoWrapperTy &AsyncInfoWrapper) const { AsyncInfoWrapperTy &AsyncInfoWrapper) const {
// Ctor/Dtor have no arguments, replaying uses the original kernel launch // Ctor/Dtor have no arguments, replaying uses the original kernel launch
// environment. Older versions of the compiler do not generate a kernel // environment. Older versions of the compiler do not generate a kernel
// launch environment. // launch environment.
if (GenericDevice.Plugin.getRecordReplay().isReplaying() || if (GenericDevice.Plugin.getRecordReplay().isReplaying() ||
Version < OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR) KernelArgs.Version < OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR)
return nullptr; return nullptr;
if (!KernelEnvironment.Configuration.ReductionDataSize || if ((!KernelEnvironment.Configuration.ReductionDataSize ||
!KernelEnvironment.Configuration.ReductionBufferLength) !KernelEnvironment.Configuration.ReductionBufferLength) &&
KernelArgs.DynCGroupMem == 0)
return reinterpret_cast<KernelLaunchEnvironmentTy *>(~0); return reinterpret_cast<KernelLaunchEnvironmentTy *>(~0);
// TODO: Check if the kernel needs a launch environment.
auto AllocOrErr = GenericDevice.dataAlloc(sizeof(KernelLaunchEnvironmentTy), auto AllocOrErr = GenericDevice.dataAlloc(sizeof(KernelLaunchEnvironmentTy),
/*HostPtr=*/nullptr, /*HostPtr=*/nullptr,
TargetAllocTy::TARGET_ALLOC_DEVICE); TargetAllocTy::TARGET_ALLOC_DEVICE);
@@ -462,7 +463,14 @@ GenericKernelTy::getKernelLaunchEnvironment(
/// async data transfer. /// async data transfer.
auto &LocalKLE = (*AsyncInfoWrapper).KernelLaunchEnvironment; auto &LocalKLE = (*AsyncInfoWrapper).KernelLaunchEnvironment;
LocalKLE = KernelLaunchEnvironment; LocalKLE = KernelLaunchEnvironment;
{
LocalKLE.DynCGroupMemSize = DynBlockMemConf.Size;
LocalKLE.DynCGroupMemFbPtr = DynBlockMemConf.FallbackPtr;
LocalKLE.DynCGroupMemFb = DynBlockMemConf.Fallback;
LocalKLE.ReductionBuffer = nullptr;
if (KernelEnvironment.Configuration.ReductionDataSize &&
KernelEnvironment.Configuration.ReductionBufferLength) {
auto AllocOrErr = GenericDevice.dataAlloc( auto AllocOrErr = GenericDevice.dataAlloc(
KernelEnvironment.Configuration.ReductionDataSize * KernelEnvironment.Configuration.ReductionDataSize *
KernelEnvironment.Configuration.ReductionBufferLength, KernelEnvironment.Configuration.ReductionBufferLength,
@@ -508,14 +516,81 @@ Error GenericKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
return Plugin::success(); return Plugin::success();
} }
/// Compute the dynamic block memory configuration for a launch of this
/// kernel. When the device's block-shared memory cannot hold both the
/// kernel's static usage and the requested dynamic size, apply the
/// user-requested fallback policy: abort, provide no memory (null), or
/// allocate from global device memory instead.
Expected<DynBlockMemConfTy>
GenericKernelTy::prepareBlockMemory(GenericDeviceTy &GenericDevice,
KernelArgsTy &KernelArgs,
uint32_t NumBlocks) const {
uint32_t MaxBlockMemSize = GenericDevice.getMaxBlockSharedMemSize();
uint32_t DynBlockMemSize = KernelArgs.DynCGroupMem;
uint32_t TotalBlockMemSize = StaticBlockMemSize + DynBlockMemSize;
uint32_t DynNativeBlockMemSize = DynBlockMemSize;
void *DynFallbackPtr = nullptr;
// The fallback behavior requested for this launch (hoisted so the flag is
// decoded only once).
auto RequestedFallback = static_cast<DynCGroupMemFallbackType>(
KernelArgs.Flags.DynCGroupMemFallback);
// Not enough block memory to cover the static one. Cannot run the kernel.
if (StaticBlockMemSize > MaxBlockMemSize)
return Plugin::error(ErrorCode::INVALID_ARGUMENT,
"Static block memory size exceeds maximum");
// Not enough block memory to cover the dynamic one, and the fallback is
// aborting.
if (RequestedFallback == DynCGroupMemFallbackType::Abort &&
TotalBlockMemSize > MaxBlockMemSize)
return Plugin::error(
ErrorCode::INVALID_ARGUMENT,
"Requested block memory size (static + dynamic) exceeds maximum");
DynCGroupMemFallbackType DynFallback = DynCGroupMemFallbackType::None;
if (DynBlockMemSize && TotalBlockMemSize > MaxBlockMemSize) {
// Launch without native dynamic block memory.
DynNativeBlockMemSize = 0;
DynFallback = RequestedFallback;
if (DynFallback != DynCGroupMemFallbackType::DefaultMem) {
// Do not provide any memory as fallback.
DynBlockMemSize = 0;
} else {
// Get global memory as fallback: one chunk of DynBlockMemSize bytes
// per block.
auto AllocOrErr = GenericDevice.dataAlloc(
NumBlocks * DynBlockMemSize,
/*HostPtr=*/nullptr, TargetAllocTy::TARGET_ALLOC_DEVICE);
if (!AllocOrErr)
return AllocOrErr.takeError();
DynFallbackPtr = *AllocOrErr;
}
}
return DynBlockMemConfTy{DynBlockMemSize, DynNativeBlockMemSize, DynFallback,
DynFallbackPtr};
}
Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs, Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
ptrdiff_t *ArgOffsets, KernelArgsTy &KernelArgs, ptrdiff_t *ArgOffsets, KernelArgsTy &KernelArgs,
AsyncInfoWrapperTy &AsyncInfoWrapper) const { AsyncInfoWrapperTy &AsyncInfoWrapper) const {
llvm::SmallVector<void *, 16> Args; llvm::SmallVector<void *, 16> Args;
llvm::SmallVector<void *, 16> Ptrs; llvm::SmallVector<void *, 16> Ptrs;
uint32_t NumThreads[3] = {KernelArgs.ThreadLimit[0],
KernelArgs.ThreadLimit[1],
KernelArgs.ThreadLimit[2]};
uint32_t NumBlocks[3] = {KernelArgs.NumTeams[0], KernelArgs.NumTeams[1],
KernelArgs.NumTeams[2]};
if (!isBareMode()) {
NumThreads[0] = getNumThreads(GenericDevice, NumThreads);
NumBlocks[0] = getNumBlocks(GenericDevice, NumBlocks, KernelArgs.Tripcount,
NumThreads[0], KernelArgs.ThreadLimit[0] > 0);
}
auto DynBlockMemConfOrErr =
prepareBlockMemory(GenericDevice, KernelArgs, NumBlocks[0]);
if (!DynBlockMemConfOrErr)
return DynBlockMemConfOrErr.takeError();
DynBlockMemConfTy &DynBlockMemConf = *DynBlockMemConfOrErr;
if (DynBlockMemConf.FallbackPtr)
AsyncInfoWrapper.freeAllocationAfterSynchronization(
DynBlockMemConf.FallbackPtr);
auto KernelLaunchEnvOrErr = getKernelLaunchEnvironment( auto KernelLaunchEnvOrErr = getKernelLaunchEnvironment(
GenericDevice, KernelArgs.Version, AsyncInfoWrapper); GenericDevice, KernelArgs, DynBlockMemConf, AsyncInfoWrapper);
if (!KernelLaunchEnvOrErr) if (!KernelLaunchEnvOrErr)
return KernelLaunchEnvOrErr.takeError(); return KernelLaunchEnvOrErr.takeError();
@@ -531,17 +606,6 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
Args, Ptrs, *KernelLaunchEnvOrErr); Args, Ptrs, *KernelLaunchEnvOrErr);
} }
uint32_t NumThreads[3] = {KernelArgs.ThreadLimit[0],
KernelArgs.ThreadLimit[1],
KernelArgs.ThreadLimit[2]};
uint32_t NumBlocks[3] = {KernelArgs.NumTeams[0], KernelArgs.NumTeams[1],
KernelArgs.NumTeams[2]};
if (!isBareMode()) {
NumThreads[0] = getNumThreads(GenericDevice, NumThreads);
NumBlocks[0] = getNumBlocks(GenericDevice, NumBlocks, KernelArgs.Tripcount,
NumThreads[0], KernelArgs.ThreadLimit[0] > 0);
}
// Record the kernel description after we modified the argument count and num // Record the kernel description after we modified the argument count and num
// blocks/threads. // blocks/threads.
RecordReplayTy &RecordReplay = GenericDevice.Plugin.getRecordReplay(); RecordReplayTy &RecordReplay = GenericDevice.Plugin.getRecordReplay();
@@ -557,8 +621,9 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
printLaunchInfo(GenericDevice, KernelArgs, NumThreads, NumBlocks)) printLaunchInfo(GenericDevice, KernelArgs, NumThreads, NumBlocks))
return Err; return Err;
return launchImpl(GenericDevice, NumThreads, NumBlocks, KernelArgs, return launchImpl(GenericDevice, NumThreads, NumBlocks,
LaunchParams, AsyncInfoWrapper); DynBlockMemConf.NativeSize, KernelArgs, LaunchParams,
AsyncInfoWrapper);
} }
KernelLaunchParamsTy GenericKernelTy::prepareArgs( KernelLaunchParamsTy GenericKernelTy::prepareArgs(
@@ -1954,6 +2019,16 @@ int32_t GenericPluginTy::query_async(int32_t DeviceId,
return OFFLOAD_SUCCESS; return OFFLOAD_SUCCESS;
} }
InfoTreeNode GenericPluginTy::obtain_device_info(int32_t DeviceId) {
// Query the device for its info tree; on failure, report the error and
// hand back an empty tree instead of propagating it.
auto DevInfoOrErr = getDevice(DeviceId).obtainInfo();
if (!DevInfoOrErr) {
REPORT() << "Failure to obtain device " << DeviceId
<< " info: " << toString(DevInfoOrErr.takeError());
return InfoTreeNode{};
}
return std::move(*DevInfoOrErr);
}
void GenericPluginTy::print_device_info(int32_t DeviceId) { void GenericPluginTy::print_device_info(int32_t DeviceId) {
if (auto Err = getDevice(DeviceId).printInfo()) if (auto Err = getDevice(DeviceId).printInfo())
REPORT() << "Failure to print device " << DeviceId REPORT() << "Failure to print device " << DeviceId

View File

@@ -261,6 +261,7 @@ typedef enum CUdevice_attribute_enum {
typedef enum CUfunction_attribute_enum { typedef enum CUfunction_attribute_enum {
CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,
CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1,
CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8,
} CUfunction_attribute; } CUfunction_attribute;

View File

@@ -150,14 +150,23 @@ struct CUDAKernelTy : public GenericKernelTy {
// The maximum number of threads cannot exceed the maximum of the kernel. // The maximum number of threads cannot exceed the maximum of the kernel.
MaxNumThreads = std::min(MaxNumThreads, (uint32_t)MaxThreads); MaxNumThreads = std::min(MaxNumThreads, (uint32_t)MaxThreads);
int SharedMemSize;
Res = cuFuncGetAttribute(&SharedMemSize,
CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, Func);
if (auto Err = Plugin::check(Res, "Error in cuFuncGetAttribute: %s"))
return Err;
// Set the static block memory size required by the kernel.
StaticBlockMemSize = SharedMemSize;
// Retrieve the size of the arguments. // Retrieve the size of the arguments.
return initArgsSize(); return initArgsSize();
} }
/// Launch the CUDA kernel function. /// Launch the CUDA kernel function.
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3], Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
uint32_t NumBlocks[3], KernelArgsTy &KernelArgs, uint32_t NumBlocks[3], uint32_t DynBlockMemSize,
KernelLaunchParamsTy LaunchParams, KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override; AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
/// Return maximum block size for maximum occupancy /// Return maximum block size for maximum occupancy
@@ -197,7 +206,7 @@ private:
CUfunction Func; CUfunction Func;
/// The maximum amount of dynamic shared memory per thread group. By default, /// The maximum amount of dynamic shared memory per thread group. By default,
/// this is set to 48 KB. /// this is set to 48 KB.
mutable uint32_t MaxDynCGroupMemLimit = 49152; mutable uint32_t MaxDynBlockMemSize = 49152;
/// The size of the kernel arguments. /// The size of the kernel arguments.
size_t ArgsSize; size_t ArgsSize;
@@ -1411,7 +1420,7 @@ private:
KernelArgsTy KernelArgs = {}; KernelArgsTy KernelArgs = {};
uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u}; uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u};
if (auto Err = CUDAKernel.launchImpl( if (auto Err = CUDAKernel.launchImpl(
*this, NumBlocksAndThreads, NumBlocksAndThreads, KernelArgs, *this, NumBlocksAndThreads, NumBlocksAndThreads, 0, KernelArgs,
KernelLaunchParamsTy{}, AsyncInfoWrapper)) KernelLaunchParamsTy{}, AsyncInfoWrapper))
return Err; return Err;
@@ -1455,6 +1464,7 @@ private:
Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice, Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
uint32_t NumThreads[3], uint32_t NumBlocks[3], uint32_t NumThreads[3], uint32_t NumBlocks[3],
uint32_t DynBlockMemSize,
KernelArgsTy &KernelArgs, KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams, KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const { AsyncInfoWrapperTy &AsyncInfoWrapper) const {
@@ -1470,9 +1480,6 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
if (auto Err = CUDADevice.getStream(AsyncInfoWrapper, Stream)) if (auto Err = CUDADevice.getStream(AsyncInfoWrapper, Stream))
return Err; return Err;
uint32_t MaxDynCGroupMem =
std::max(KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize());
size_t ConfigArgsSize = ArgsSize; size_t ConfigArgsSize = ArgsSize;
void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, LaunchParams.Data, void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, LaunchParams.Data,
CU_LAUNCH_PARAM_BUFFER_SIZE, CU_LAUNCH_PARAM_BUFFER_SIZE,
@@ -1484,20 +1491,24 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
if (GenericDevice.getRPCServer()) if (GenericDevice.getRPCServer())
GenericDevice.Plugin.getRPCServer().Thread->notify(); GenericDevice.Plugin.getRPCServer().Thread->notify();
// Increase to the requested dynamic memory size for the device if needed.
DynBlockMemSize =
std::max(DynBlockMemSize, GenericDevice.getDynamicMemorySize());
// In case we require more memory than the current limit. // In case we require more memory than the current limit.
if (MaxDynCGroupMem >= MaxDynCGroupMemLimit) { if (DynBlockMemSize >= MaxDynBlockMemSize) {
CUresult AttrResult = cuFuncSetAttribute( CUresult AttrResult = cuFuncSetAttribute(
Func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, MaxDynCGroupMem); Func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, DynBlockMemSize);
if (auto Err = Plugin::check( if (auto Err = Plugin::check(
AttrResult, AttrResult,
"error in cuFuncSetAttribute while setting the memory limits: %s")) "error in cuFuncSetAttribute while setting the memory limits: %s"))
return Err; return Err;
MaxDynCGroupMemLimit = MaxDynCGroupMem; MaxDynBlockMemSize = DynBlockMemSize;
} }
CUresult Res = cuLaunchKernel(Func, NumBlocks[0], NumBlocks[1], NumBlocks[2], CUresult Res = cuLaunchKernel(Func, NumBlocks[0], NumBlocks[1], NumBlocks[2],
NumThreads[0], NumThreads[1], NumThreads[2], NumThreads[0], NumThreads[1], NumThreads[2],
MaxDynCGroupMem, Stream, nullptr, Config); DynBlockMemSize, Stream, nullptr, Config);
// Register a callback to indicate when the kernel is complete. // Register a callback to indicate when the kernel is complete.
if (GenericDevice.getRPCServer()) if (GenericDevice.getRPCServer())

View File

@@ -97,8 +97,8 @@ struct GenELF64KernelTy : public GenericKernelTy {
/// Launch the kernel using the libffi. /// Launch the kernel using the libffi.
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3], Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
uint32_t NumBlocks[3], KernelArgsTy &KernelArgs, uint32_t NumBlocks[3], uint32_t DynBlockMemSize,
KernelLaunchParamsTy LaunchParams, KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override { AsyncInfoWrapperTy &AsyncInfoWrapper) const override {
if (!SupportsFFI) if (!SupportsFFI)
return Plugin::error(ErrorCode::UNSUPPORTED, return Plugin::error(ErrorCode::UNSUPPORTED,

View File

@@ -124,8 +124,8 @@ public:
Error initImpl(GenericDeviceTy &GenericDevice, DeviceImageTy &Image) override; Error initImpl(GenericDeviceTy &GenericDevice, DeviceImageTy &Image) override;
/// Launch the L0 kernel function. /// Launch the L0 kernel function.
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3], Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
uint32_t NumBlocks[3], KernelArgsTy &KernelArgs, uint32_t NumBlocks[3], uint32_t DynBlockMemSize,
KernelLaunchParamsTy LaunchParams, KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override; AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
Error deinit() { Error deinit() {
CALL_ZE_RET_ERROR(zeKernelDestroy, zeKernel); CALL_ZE_RET_ERROR(zeKernelDestroy, zeKernel);

View File

@@ -413,9 +413,13 @@ Error L0KernelTy::setIndirectFlags(L0DeviceTy &l0Device,
Error L0KernelTy::launchImpl(GenericDeviceTy &GenericDevice, Error L0KernelTy::launchImpl(GenericDeviceTy &GenericDevice,
uint32_t NumThreads[3], uint32_t NumBlocks[3], uint32_t NumThreads[3], uint32_t NumBlocks[3],
KernelArgsTy &KernelArgs, uint32_t DynBlockMemSize, KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams, KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const { AsyncInfoWrapperTy &AsyncInfoWrapper) const {
if (DynBlockMemSize > 0)
return Plugin::error(ErrorCode::UNSUPPORTED,
"dynamic shared memory is unsupported in L0 plugin");
auto &l0Device = L0DeviceTy::makeL0Device(GenericDevice); auto &l0Device = L0DeviceTy::makeL0Device(GenericDevice);
__tgt_async_info *AsyncInfo = AsyncInfoWrapper; __tgt_async_info *AsyncInfo = AsyncInfoWrapper;

View File

@@ -0,0 +1,199 @@
// RUN: %libomptarget-compilexx-generic -fopenmp-version=61
// RUN: %libomptarget-run-generic | %fcheck-generic
// RUN: %libomptarget-compileoptxx-generic -fopenmp-version=61
// RUN: %libomptarget-run-generic | %fcheck-generic
// REQUIRES: gpu
// UNSUPPORTED: intelgpu
#include <omp.h>
#include <stdio.h>
#define N 512
int main() {
int Result[N], NumThreads;
// Verify the groupprivate buffer works as expected.
#pragma omp target teams num_teams(1) thread_limit(N) \
dyn_groupprivate(fallback(abort) : N * sizeof(Result[0])) \
map(from : Result, NumThreads)
{
int Buffer[N];
#pragma omp parallel
{
int *DynBuffer = (int *)omp_get_dyn_gprivate_nofb_ptr();
int TId = omp_get_thread_num();
if (TId == 0)
NumThreads = omp_get_num_threads();
Buffer[TId] = 7;
DynBuffer[TId] = 3;
#pragma omp barrier
int WrappedTId = (TId + 37) % NumThreads;
Result[TId] = Buffer[WrappedTId] + DynBuffer[WrappedTId];
}
}
if (NumThreads < N / 2 || NumThreads > N) {
printf("Expected number of threads to be in [%i:%i], but got: %i", N / 2, N,
NumThreads);
return -1;
}
int Failed = 0;
for (int i = 0; i < NumThreads; ++i) {
if (Result[i] != 7 + 3) {
printf("Result[%i] is %i, expected %i\n", i, Result[i], 7 + 3);
++Failed;
}
}
// Verify that the routines in the host returns NULL and zero.
if (omp_get_dyn_gprivate_ptr())
++Failed;
if (omp_get_dyn_gprivate_nofb_ptr())
++Failed;
if (omp_get_dyn_gprivate_size())
++Failed;
size_t MaxSize = omp_get_gprivate_limit(0, omp_access_cgroup);
size_t ExceededSize = MaxSize + 10;
// Verify that the fallback(default_mem) modifier works.
#pragma omp target dyn_groupprivate(fallback(default_mem) : ExceededSize) \
map(tofrom : Failed)
{
if (!omp_get_dyn_gprivate_ptr(0))
++Failed;
if (omp_get_dyn_gprivate_nofb_ptr(0))
++Failed;
if (omp_get_dyn_gprivate_ptr(0) == omp_get_dyn_gprivate_nofb_ptr(0))
++Failed;
if (!omp_get_dyn_gprivate_size())
++Failed;
if (omp_get_dyn_gprivate_size() != ExceededSize)
++Failed;
if (omp_get_dyn_gprivate_memspace() != omp_default_mem_space)
++Failed;
}
// Verify that the fallback(null) modifier works.
#pragma omp target dyn_groupprivate(fallback(null) : ExceededSize) \
map(tofrom : Failed)
{
if (omp_get_dyn_gprivate_ptr(0))
++Failed;
if (omp_get_dyn_gprivate_nofb_ptr(0))
++Failed;
if (omp_get_dyn_gprivate_ptr(0) != omp_get_dyn_gprivate_nofb_ptr(0))
++Failed;
if (omp_get_dyn_gprivate_size())
++Failed;
if (omp_get_dyn_gprivate_memspace() != omp_null_mem_space)
++Failed;
}
// Verify that the default modifier is fallback(default_mem).
#pragma omp target dyn_groupprivate(ExceededSize)
{
if (!omp_get_dyn_gprivate_ptr(0))
++Failed;
if (omp_get_dyn_gprivate_nofb_ptr(0))
++Failed;
if (omp_get_dyn_gprivate_ptr(0) == omp_get_dyn_gprivate_nofb_ptr(0))
++Failed;
if (!omp_get_dyn_gprivate_size())
++Failed;
if (omp_get_dyn_gprivate_size() != ExceededSize)
++Failed;
if (omp_get_dyn_gprivate_memspace() != omp_default_mem_space)
++Failed;
}
// Verify that the fallback(abort) modifier works.
#pragma omp target dyn_groupprivate(fallback(abort) : N) map(tofrom : Failed)
{
if (!omp_get_dyn_gprivate_ptr(0))
++Failed;
if (!omp_get_dyn_gprivate_nofb_ptr(0))
++Failed;
if (omp_get_dyn_gprivate_ptr(0) != omp_get_dyn_gprivate_nofb_ptr(0))
++Failed;
if (omp_get_dyn_gprivate_ptr(5) != omp_get_dyn_gprivate_nofb_ptr(5))
++Failed;
if (!omp_get_dyn_gprivate_size())
++Failed;
if (omp_get_dyn_gprivate_size() != N)
++Failed;
if (omp_get_dyn_gprivate_memspace() != omp_cgroup_mem_space)
++Failed;
}
// Verify that the fallback(default_mem) does not trigger when not needed.
#pragma omp target dyn_groupprivate(fallback(default_mem) : N) \
map(tofrom : Failed)
{
if (!omp_get_dyn_gprivate_ptr(0))
++Failed;
if (!omp_get_dyn_gprivate_nofb_ptr(0))
++Failed;
if (omp_get_dyn_gprivate_ptr(0) != omp_get_dyn_gprivate_nofb_ptr(0))
++Failed;
if (!omp_get_dyn_gprivate_size())
++Failed;
if (omp_get_dyn_gprivate_size() != N)
++Failed;
if (omp_get_dyn_gprivate_memspace() != omp_cgroup_mem_space)
++Failed;
}
// Verify that the clause works when passing a zero size.
#pragma omp target dyn_groupprivate(fallback(abort) : 0) map(tofrom : Failed)
{
if (omp_get_dyn_gprivate_ptr(0))
++Failed;
if (omp_get_dyn_gprivate_nofb_ptr(0))
++Failed;
if (omp_get_dyn_gprivate_ptr(0) != omp_get_dyn_gprivate_nofb_ptr(0))
++Failed;
if (omp_get_dyn_gprivate_size())
++Failed;
if (omp_get_dyn_gprivate_memspace() != omp_null_mem_space)
++Failed;
}
// Verify that the clause works when passing a zero size and
// fallback(default_mem).
#pragma omp target dyn_groupprivate(fallback(default_mem) : 0) \
map(tofrom : Failed)
{
if (omp_get_dyn_gprivate_ptr(0))
++Failed;
if (omp_get_dyn_gprivate_nofb_ptr(0))
++Failed;
if (omp_get_dyn_gprivate_ptr(0) != omp_get_dyn_gprivate_nofb_ptr(0))
++Failed;
if (omp_get_dyn_gprivate_size())
++Failed;
if (omp_get_dyn_gprivate_memspace() != omp_null_mem_space)
++Failed;
}
// Verify that omitting the clause is the same as setting zero size.
#pragma omp target map(tofrom : Failed)
{
if (omp_get_dyn_gprivate_ptr(0))
++Failed;
if (omp_get_dyn_gprivate_nofb_ptr(0))
++Failed;
if (omp_get_dyn_gprivate_ptr(0) != omp_get_dyn_gprivate_nofb_ptr(0))
++Failed;
if (omp_get_dyn_gprivate_size())
++Failed;
if (omp_get_dyn_gprivate_memspace() != omp_null_mem_space)
++Failed;
}
// CHECK: PASS
if (!Failed)
printf("PASS\n");
}

View File

@@ -171,9 +171,29 @@ typedef enum omp_allocator_handle_t {
KMP_ALLOCATOR_MAX_HANDLE = ~(0LU) KMP_ALLOCATOR_MAX_HANDLE = ~(0LU)
} omp_allocator_handle_t; } omp_allocator_handle_t;
/// Memory space handles visible to device code.
/// NOTE(review): the numeric values must mirror the host-side
/// omp_memspace_handle_t in <omp.h> (omp_default_mem_space is 99 there) —
/// keep the two definitions in sync.
typedef enum omp_memspace_handle_t {
  omp_null_mem_space = 0,
  omp_default_mem_space = 99,
  omp_large_cap_mem_space = 1,
  omp_const_mem_space = 2,
  omp_high_bw_mem_space = 3,
  omp_low_lat_mem_space = 4,
  omp_cgroup_mem_space = 5,
  // Sentinel forcing the underlying representation; matches the host header.
  KMP_MEMSPACE_MAX_HANDLE = ~(0LU)
} omp_memspace_handle_t;
#define __PRAGMA(STR) _Pragma(#STR) #define __PRAGMA(STR) _Pragma(#STR)
#define OMP_PRAGMA(STR) __PRAGMA(omp STR) #define OMP_PRAGMA(STR) __PRAGMA(omp STR)
///} ///}
/// The OpenMP access group type. The criterion for grouping tasks using a
/// specific grouping property.
enum omp_access_t {
  /// Groups the tasks based on the contention group to which they belong.
  omp_access_cgroup = 0,
  /// Groups the tasks based on the parallel region to which they bind.
  omp_access_pteam = 1,
};
#endif #endif

View File

@@ -226,7 +226,7 @@ struct KernelEnvironmentTy;
int8_t __kmpc_is_spmd_exec_mode(); int8_t __kmpc_is_spmd_exec_mode();
int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment, int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
KernelLaunchEnvironmentTy &KernelLaunchEnvironment); KernelLaunchEnvironmentTy *KernelLaunchEnvironment);
void __kmpc_target_deinit(); void __kmpc_target_deinit();

View File

@@ -116,7 +116,7 @@ extern Local<ThreadStateTy **> ThreadStates;
/// Initialize the state machinery. Must be called by all threads. /// Initialize the state machinery. Must be called by all threads.
void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment, void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
KernelLaunchEnvironmentTy &KernelLaunchEnvironment); KernelLaunchEnvironmentTy *KernelLaunchEnvironment);
/// Return the kernel and kernel launch environment associated with the current /// Return the kernel and kernel launch environment associated with the current
/// kernel. The former is static and contains compile time information that /// kernel. The former is static and contains compile time information that

View File

@@ -35,8 +35,8 @@ enum OMPTgtExecModeFlags : unsigned char {
}; };
static void static void
inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment, initializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
KernelLaunchEnvironmentTy &KernelLaunchEnvironment) { KernelLaunchEnvironmentTy *KernelLaunchEnvironment) {
// Order is important here. // Order is important here.
synchronize::init(IsSPMD); synchronize::init(IsSPMD);
mapping::init(IsSPMD); mapping::init(IsSPMD);
@@ -80,17 +80,17 @@ extern "C" {
/// \param Ident Source location identification, can be NULL. /// \param Ident Source location identification, can be NULL.
/// ///
int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment, int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
KernelLaunchEnvironmentTy &KernelLaunchEnvironment) { KernelLaunchEnvironmentTy *KernelLaunchEnvironment) {
ConfigurationEnvironmentTy &Configuration = KernelEnvironment.Configuration; ConfigurationEnvironmentTy &Configuration = KernelEnvironment.Configuration;
bool IsSPMD = Configuration.ExecMode & OMP_TGT_EXEC_MODE_SPMD; bool IsSPMD = Configuration.ExecMode & OMP_TGT_EXEC_MODE_SPMD;
bool UseGenericStateMachine = Configuration.UseGenericStateMachine; bool UseGenericStateMachine = Configuration.UseGenericStateMachine;
if (IsSPMD) { if (IsSPMD) {
inititializeRuntime(/*IsSPMD=*/true, KernelEnvironment, initializeRuntime(/*IsSPMD=*/true, KernelEnvironment,
KernelLaunchEnvironment); KernelLaunchEnvironment);
synchronize::threadsAligned(atomic::relaxed); synchronize::threadsAligned(atomic::relaxed);
} else { } else {
inititializeRuntime(/*IsSPMD=*/false, KernelEnvironment, initializeRuntime(/*IsSPMD=*/false, KernelEnvironment,
KernelLaunchEnvironment); KernelLaunchEnvironment);
// No need to wait since only the main threads will execute user // No need to wait since only the main threads will execute user
// code and workers will run into a barrier right away. // code and workers will run into a barrier right away.
} }

View File

@@ -40,6 +40,10 @@ using namespace ompx;
[[clang::loader_uninitialized]] static Local<KernelLaunchEnvironmentTy *> [[clang::loader_uninitialized]] static Local<KernelLaunchEnvironmentTy *>
KernelLaunchEnvironmentPtr; KernelLaunchEnvironmentPtr;
/// The pointer type for dynamic shared memory. This is important to keep
/// the alignment and address space information.
using SharedMemPtrTy = decltype(&DynamicSharedBuffer[0]);
///} ///}
namespace { namespace {
@@ -138,6 +142,60 @@ void SharedMemorySmartStackTy::pop(void *Ptr, uint64_t Bytes) {
memory::freeGlobal(Ptr, "Slow path shared memory deallocation"); memory::freeGlobal(Ptr, "Slow path shared memory deallocation");
} }
/// Manager of the dynamic cgroup (groupprivate) memory buffer for the
/// current team.
struct DynCGroupMemTy {
  /// Initialize the manager with the information from the kernel launch
  /// environment and the pointer to the native shared memory buffer.
  void init(KernelLaunchEnvironmentTy *KLE, SharedMemPtrTy NativePtr) {
    // Initialize default values: no buffer at all.
    NativeOrNullPtr = nullptr;
    FallbackPtr = nullptr;
    Size = 0;
    Fallback = DynCGroupMemFallbackType::None;

    // No launch environment means no dyn_groupprivate information.
    if (!KLE)
      return;

    // Initialize values using the kernel launch environment.
    Size = KLE->DynCGroupMemSize;
    Fallback = KLE->DynCGroupMemFb;

    // NOTE(review): a Fallback other than None presumably means the host
    // already decided the native shared-memory request could not be
    // satisfied, so the native pointer is only published when no fallback
    // was taken — confirm against the host-side launch code.
    if (Size && Fallback == DynCGroupMemFallbackType::None)
      NativeOrNullPtr = NativePtr;

    // For the default_mem fallback the host supplies one chunk per block;
    // select this block's slice by its id.
    if (Fallback == DynCGroupMemFallbackType::DefaultMem)
      FallbackPtr = static_cast<unsigned char *>(KLE->DynCGroupMemFbPtr) +
                    Size * mapping::getBlockIdInKernel();
  }

  /// Get the memory space of the buffer: omp_null_mem_space when no buffer
  /// exists, omp_cgroup_mem_space for the native buffer, and
  /// omp_default_mem_space when the default-memory fallback was taken.
  omp_memspace_handle_t getMemSpace() const {
    if (Size == 0)
      return omp_null_mem_space;
    if (Fallback == DynCGroupMemFallbackType::None)
      return omp_cgroup_mem_space;
    return omp_default_mem_space;
  }

  /// Get the size of the buffer in bytes (zero when no buffer exists).
  size_t getSize() const { return Size; }

  /// Get the native pointer or null if it was a fallback.
  SharedMemPtrTy getNativeOrNullPtr() const { return NativeOrNullPtr; }

  /// Get the native pointer or the fallback pointer.
  unsigned char *getNativeOrFallbackPtr() const {
    return (Fallback == DynCGroupMemFallbackType::DefaultMem)
               ? FallbackPtr
               : getNativeOrNullPtr();
  }

private:
  // Native shared-memory buffer, or null when a fallback (or no clause)
  // applies.
  SharedMemPtrTy NativeOrNullPtr;
  // This block's slice of the host-provided default-memory fallback buffer.
  unsigned char *FallbackPtr;
  // Requested dyn_groupprivate size in bytes.
  size_t Size;
  // Fallback kind recorded in the kernel launch environment.
  DynCGroupMemFallbackType Fallback;
};
[[clang::loader_uninitialized]] static Local<DynCGroupMemTy> DynCGroupMem;
} // namespace } // namespace
void *memory::getDynamicBuffer() { return DynamicSharedBuffer; } void *memory::getDynamicBuffer() { return DynamicSharedBuffer; }
@@ -226,13 +284,18 @@ int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
} // namespace } // namespace
void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment, void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
KernelLaunchEnvironmentTy &KernelLaunchEnvironment) { KernelLaunchEnvironmentTy *KLE) {
SharedMemorySmartStack.init(IsSPMD); SharedMemorySmartStack.init(IsSPMD);
if (KLE == reinterpret_cast<KernelLaunchEnvironmentTy *>(~0))
KLE = nullptr;
if (mapping::isInitialThreadInLevel0(IsSPMD)) { if (mapping::isInitialThreadInLevel0(IsSPMD)) {
DynCGroupMem.init(KLE, DynamicSharedBuffer);
TeamState.init(IsSPMD); TeamState.init(IsSPMD);
ThreadStates = nullptr; ThreadStates = nullptr;
KernelEnvironmentPtr = &KernelEnvironment; KernelEnvironmentPtr = &KernelEnvironment;
KernelLaunchEnvironmentPtr = &KernelLaunchEnvironment; KernelLaunchEnvironmentPtr = KLE;
} }
} }
@@ -416,6 +479,25 @@ int omp_get_team_num() { return mapping::getBlockIdInKernel(); }
int omp_get_initial_device(void) { return -1; } int omp_get_initial_device(void) { return -1; }
int omp_is_initial_device(void) { return 0; } int omp_is_initial_device(void) { return 0; }
/// Device implementation: address at Offset into the dynamic groupprivate
/// buffer, preferring the default-memory fallback buffer when one was
/// provided. The omp_access_t argument is currently ignored.
/// NOTE(review): when no buffer exists this computes null + Offset; visible
/// callers only pass 0 in that case — confirm that contract.
void *omp_get_dyn_gprivate_ptr(size_t Offset, omp_access_t) {
  return DynCGroupMem.getNativeOrFallbackPtr() + Offset;
}
/// Device implementation: address at Offset into the *native* dynamic
/// groupprivate buffer only; returns null when a fallback (or no buffer)
/// applies. The omp_access_t argument is currently ignored.
void *omp_get_dyn_gprivate_nofb_ptr(size_t Offset, omp_access_t) {
  unsigned char *Ptr = DynCGroupMem.getNativeOrNullPtr();
  // FIX: bail out before doing pointer arithmetic on a null pointer, which
  // is undefined behavior for a non-zero Offset. Returns null exactly as the
  // Offset == 0 path did before.
  if (!Ptr)
    return nullptr;
  // Ensure the alignment and address space information is kept.
  Ptr = (unsigned char *)__builtin_assume_aligned(Ptr, allocator::ALIGNMENT);
  return (SharedMemPtrTy)(Ptr + Offset);
}
/// Device implementation: size in bytes of the dynamic groupprivate buffer
/// for this launch (zero when no buffer exists). The omp_access_t argument
/// is currently ignored.
size_t omp_get_dyn_gprivate_size(omp_access_t) {
  return DynCGroupMem.getSize();
}

/// Device implementation: memory space of the dynamic groupprivate buffer —
/// null (no buffer), cgroup (native shared memory), or default (fallback).
omp_memspace_handle_t omp_get_dyn_gprivate_memspace(omp_access_t) {
  return DynCGroupMem.getMemSpace();
}
} }
extern "C" { extern "C" {

View File

@@ -607,6 +607,7 @@ kmp_set_disp_num_buffers 890
llvm_omp_target_shared_mem_space DATA llvm_omp_target_shared_mem_space DATA
llvm_omp_target_device_mem_space DATA llvm_omp_target_device_mem_space DATA
omp_null_mem_space DATA omp_null_mem_space DATA
omp_cgroup_mem_space DATA
%ifndef stub %ifndef stub
# Ordinals between 900 and 999 are reserved # Ordinals between 900 and 999 are reserved

View File

@@ -380,6 +380,11 @@
omp_uintptr_t value; omp_uintptr_t value;
} omp_alloctrait_t; } omp_alloctrait_t;
typedef enum {
omp_access_cgroup = 0,
omp_access_pteam = 1
} omp_access_t;
# if defined(_WIN32) # if defined(_WIN32)
// On Windows cl and icl do not support 64-bit enum, let's use integer then. // On Windows cl and icl do not support 64-bit enum, let's use integer then.
typedef omp_uintptr_t omp_allocator_handle_t; typedef omp_uintptr_t omp_allocator_handle_t;
@@ -403,6 +408,7 @@
extern __KMP_IMP omp_memspace_handle_t const omp_const_mem_space; extern __KMP_IMP omp_memspace_handle_t const omp_const_mem_space;
extern __KMP_IMP omp_memspace_handle_t const omp_high_bw_mem_space; extern __KMP_IMP omp_memspace_handle_t const omp_high_bw_mem_space;
extern __KMP_IMP omp_memspace_handle_t const omp_low_lat_mem_space; extern __KMP_IMP omp_memspace_handle_t const omp_low_lat_mem_space;
extern __KMP_IMP omp_memspace_handle_t const omp_cgroup_mem_space;
extern __KMP_IMP omp_memspace_handle_t const llvm_omp_target_host_mem_space; extern __KMP_IMP omp_memspace_handle_t const llvm_omp_target_host_mem_space;
extern __KMP_IMP omp_memspace_handle_t const llvm_omp_target_shared_mem_space; extern __KMP_IMP omp_memspace_handle_t const llvm_omp_target_shared_mem_space;
extern __KMP_IMP omp_memspace_handle_t const llvm_omp_target_device_mem_space; extern __KMP_IMP omp_memspace_handle_t const llvm_omp_target_device_mem_space;
@@ -439,6 +445,7 @@
omp_const_mem_space = 2, omp_const_mem_space = 2,
omp_high_bw_mem_space = 3, omp_high_bw_mem_space = 3,
omp_low_lat_mem_space = 4, omp_low_lat_mem_space = 4,
omp_cgroup_mem_space = 5,
llvm_omp_target_host_mem_space = 100, llvm_omp_target_host_mem_space = 100,
llvm_omp_target_shared_mem_space = 101, llvm_omp_target_shared_mem_space = 101,
llvm_omp_target_device_mem_space = 102, llvm_omp_target_device_mem_space = 102,
@@ -463,6 +470,11 @@
omp_allocator_handle_t allocator = omp_null_allocator, omp_allocator_handle_t allocator = omp_null_allocator,
omp_allocator_handle_t free_allocator = omp_null_allocator); omp_allocator_handle_t free_allocator = omp_null_allocator);
extern void __KAI_KMPC_CONVENTION omp_free(void * ptr, omp_allocator_handle_t a = omp_null_allocator); extern void __KAI_KMPC_CONVENTION omp_free(void * ptr, omp_allocator_handle_t a = omp_null_allocator);
extern void *__KAI_KMPC_CONVENTION omp_get_dyn_gprivate_ptr(size_t offset = 0, omp_access_t access_group = omp_access_cgroup);
extern void *__KAI_KMPC_CONVENTION omp_get_dyn_gprivate_nofb_ptr(size_t offset = 0, omp_access_t access_group = omp_access_cgroup);
extern size_t __KAI_KMPC_CONVENTION omp_get_dyn_gprivate_size(omp_access_t access_group = omp_access_cgroup);
extern omp_memspace_handle_t __KAI_KMPC_CONVENTION omp_get_dyn_gprivate_memspace(omp_access_t access_group = omp_access_cgroup);
extern size_t __KAI_KMPC_CONVENTION omp_get_gprivate_limit(int device_num, omp_access_t access_group = omp_access_cgroup);
# else # else
extern void *__KAI_KMPC_CONVENTION omp_alloc(size_t size, omp_allocator_handle_t a); extern void *__KAI_KMPC_CONVENTION omp_alloc(size_t size, omp_allocator_handle_t a);
extern void *__KAI_KMPC_CONVENTION omp_aligned_alloc(size_t align, size_t size, extern void *__KAI_KMPC_CONVENTION omp_aligned_alloc(size_t align, size_t size,
@@ -473,6 +485,11 @@
extern void *__KAI_KMPC_CONVENTION omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator, extern void *__KAI_KMPC_CONVENTION omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator,
omp_allocator_handle_t free_allocator); omp_allocator_handle_t free_allocator);
extern void __KAI_KMPC_CONVENTION omp_free(void *ptr, omp_allocator_handle_t a); extern void __KAI_KMPC_CONVENTION omp_free(void *ptr, omp_allocator_handle_t a);
extern void *__KAI_KMPC_CONVENTION omp_get_dyn_gprivate_ptr(size_t offset, omp_access_t access_group);
extern void *__KAI_KMPC_CONVENTION omp_get_dyn_gprivate_nofb_ptr(size_t offset, omp_access_t access_group);
extern size_t __KAI_KMPC_CONVENTION omp_get_dyn_gprivate_size(omp_access_t access_group);
extern omp_memspace_handle_t __KAI_KMPC_CONVENTION omp_get_dyn_gprivate_memspace(omp_access_t access_group);
extern size_t __KAI_KMPC_CONVENTION omp_get_gprivate_limit(int device_num, omp_access_t access_group);
# endif # endif
/* OpenMP TR11 routines to get memory spaces and allocators */ /* OpenMP TR11 routines to get memory spaces and allocators */

View File

@@ -1072,6 +1072,7 @@ extern omp_memspace_handle_t const omp_large_cap_mem_space;
extern omp_memspace_handle_t const omp_const_mem_space; extern omp_memspace_handle_t const omp_const_mem_space;
extern omp_memspace_handle_t const omp_high_bw_mem_space; extern omp_memspace_handle_t const omp_high_bw_mem_space;
extern omp_memspace_handle_t const omp_low_lat_mem_space; extern omp_memspace_handle_t const omp_low_lat_mem_space;
extern omp_memspace_handle_t const omp_cgroup_mem_space;
extern omp_memspace_handle_t const llvm_omp_target_host_mem_space; extern omp_memspace_handle_t const llvm_omp_target_host_mem_space;
extern omp_memspace_handle_t const llvm_omp_target_shared_mem_space; extern omp_memspace_handle_t const llvm_omp_target_shared_mem_space;
extern omp_memspace_handle_t const llvm_omp_target_device_mem_space; extern omp_memspace_handle_t const llvm_omp_target_device_mem_space;

View File

@@ -4515,6 +4515,20 @@ void omp_free(void *ptr, omp_allocator_handle_t allocator) {
} }
/* end of OpenMP 5.1 Memory Management routines */ /* end of OpenMP 5.1 Memory Management routines */
/* Host-side implementations of the dyn_groupprivate query routines. There is
   no dynamic groupprivate storage on the host, so they uniformly report "no
   buffer": NULL pointers, zero size, and the null memory space. */
void *omp_get_dyn_gprivate_ptr(size_t offset, omp_access_t access_group) {
  return NULL;
}
void *omp_get_dyn_gprivate_nofb_ptr(size_t offset, omp_access_t access_group) {
  return NULL;
}
size_t omp_get_dyn_gprivate_size(omp_access_t access_group) { return 0; }
omp_memspace_handle_t omp_get_dyn_gprivate_memspace(omp_access_t access_group) {
  return omp_null_mem_space;
}
int __kmpc_get_target_offload(void) { int __kmpc_get_target_offload(void) {
if (!__kmp_init_serial) { if (!__kmp_init_serial) {
__kmp_serial_initialize(); __kmp_serial_initialize();

View File

@@ -333,6 +333,8 @@ omp_memspace_handle_t const omp_high_bw_mem_space =
(omp_memspace_handle_t const)3; (omp_memspace_handle_t const)3;
omp_memspace_handle_t const omp_low_lat_mem_space = omp_memspace_handle_t const omp_low_lat_mem_space =
(omp_memspace_handle_t const)4; (omp_memspace_handle_t const)4;
omp_memspace_handle_t const omp_cgroup_mem_space =
(omp_memspace_handle_t const)5;
omp_memspace_handle_t const llvm_omp_target_host_mem_space = omp_memspace_handle_t const llvm_omp_target_host_mem_space =
(omp_memspace_handle_t const)100; (omp_memspace_handle_t const)100;
omp_memspace_handle_t const llvm_omp_target_shared_mem_space = omp_memspace_handle_t const llvm_omp_target_shared_mem_space =

View File

@@ -368,6 +368,8 @@ omp_memspace_handle_t const omp_high_bw_mem_space =
(omp_memspace_handle_t const)3; (omp_memspace_handle_t const)3;
omp_memspace_handle_t const omp_low_lat_mem_space = omp_memspace_handle_t const omp_low_lat_mem_space =
(omp_memspace_handle_t const)4; (omp_memspace_handle_t const)4;
omp_memspace_handle_t const omp_cgroup_mem_space =
(omp_memspace_handle_t const)5;
omp_memspace_handle_t const llvm_omp_target_host_mem_space = omp_memspace_handle_t const llvm_omp_target_host_mem_space =
(omp_memspace_handle_t const)100; (omp_memspace_handle_t const)100;
omp_memspace_handle_t const llvm_omp_target_shared_mem_space = omp_memspace_handle_t const llvm_omp_target_shared_mem_space =
@@ -454,6 +456,31 @@ void omp_free(void *ptr, omp_allocator_handle_t allocator) {
#endif #endif
} }
/* Stub-library versions of the dyn_groupprivate routines. `i` is presumably
   the stub library's initialization/ignore macro defined earlier in this
   file (TODO confirm). As on the host runtime, there is no dynamic
   groupprivate storage, so these report "no buffer". */
void *omp_get_dyn_gprivate_ptr(size_t offset, omp_access_t access_group) {
  i;
  return NULL;
}
void *omp_get_dyn_gprivate_nofb_ptr(size_t offset, omp_access_t access_group) {
  i;
  return NULL;
}
size_t omp_get_dyn_gprivate_size(omp_access_t access_group) {
  i;
  return 0;
}
omp_memspace_handle_t omp_get_dyn_gprivate_memspace(omp_access_t access_group) {
  i;
  return omp_null_mem_space;
}
size_t omp_get_gprivate_limit(int device_num, omp_access_t access_group) {
  i;
  return 0;
}
/* OpenMP 5.0 Affinity Format */ /* OpenMP 5.0 Affinity Format */
void omp_set_affinity_format(char const *format) { i; } void omp_set_affinity_format(char const *format) { i; }
size_t omp_get_affinity_format(char *buffer, size_t size) { size_t omp_get_affinity_format(char *buffer, size_t size) {