From 1f583c6dee360b0f5837a1026f2c594643cf885c Mon Sep 17 00:00:00 2001 From: Kevin Sala Penades Date: Thu, 12 Mar 2026 01:13:06 -0700 Subject: [PATCH] [OpenMP][Offload] Add offload runtime support for dyn_groupprivate clause (#152831) Part 3 adding offload runtime support. See https://github.com/llvm/llvm-project/pull/152651. --------- Co-authored-by: Krzysztof Parzyszek --- .../include/llvm/Frontend/OpenMP/OMPKinds.def | 2 +- offload/include/Shared/APITypes.h | 5 +- offload/include/Shared/Environment.h | 17 +- offload/include/device.h | 16 ++ offload/include/omptarget.h | 11 + offload/libomptarget/OpenMP/API.cpp | 16 ++ offload/libomptarget/exports | 1 + offload/plugins-nextgen/amdgpu/src/rtl.cpp | 27 ++- .../common/include/PluginInterface.h | 35 ++- .../common/src/PluginInterface.cpp | 115 ++++++++-- .../plugins-nextgen/cuda/dynamic_cuda/cuda.h | 1 + offload/plugins-nextgen/cuda/src/rtl.cpp | 33 ++- offload/plugins-nextgen/host/src/rtl.cpp | 4 +- .../level_zero/include/L0Kernel.h | 4 +- .../level_zero/src/L0Kernel.cpp | 6 +- offload/test/offloading/dyn_groupprivate.cpp | 199 ++++++++++++++++++ openmp/device/include/DeviceTypes.h | 20 ++ openmp/device/include/Interface.h | 2 +- openmp/device/include/State.h | 2 +- openmp/device/src/Kernel.cpp | 14 +- openmp/device/src/State.cpp | 86 +++++++- openmp/runtime/src/dllexports | 1 + openmp/runtime/src/include/omp.h.var | 17 ++ openmp/runtime/src/kmp.h | 1 + openmp/runtime/src/kmp_csupport.cpp | 14 ++ openmp/runtime/src/kmp_global.cpp | 2 + openmp/runtime/src/kmp_stub.cpp | 27 +++ 27 files changed, 613 insertions(+), 65 deletions(-) create mode 100644 offload/test/offloading/dyn_groupprivate.cpp diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def index 152a8f727310..5fe7ee899724 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def +++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def @@ -101,7 +101,7 @@ __OMP_STRUCT_TYPE(DynamicEnvironment, 
DynamicEnvironmentTy, false, Int16) __OMP_STRUCT_TYPE(KernelEnvironment, KernelEnvironmentTy, false, ConfigurationEnvironment, IdentPtr, DynamicEnvironmentPtr) __OMP_STRUCT_TYPE(KernelLaunchEnvironment, KernelLaunchEnvironmentTy, false, - Int32, Int32) + VoidPtr, VoidPtr, Int32, Int32, Int32, Int8) #undef __OMP_STRUCT_TYPE #undef OMP_STRUCT_TYPE diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h index 8c150b6bfc2d..6183686290bd 100644 --- a/offload/include/Shared/APITypes.h +++ b/offload/include/Shared/APITypes.h @@ -102,8 +102,9 @@ struct KernelArgsTy { struct { uint64_t NoWait : 1; // Was this kernel spawned with a `nowait` clause. uint64_t IsCUDA : 1; // Was this kernel spawned via CUDA. - uint64_t Unused : 62; - } Flags = {0, 0, 0}; + uint64_t DynCGroupMemFallback : 2; // The fallback for dynamic cgroup mem. + uint64_t Unused : 60; + } Flags = {0, 0, 0, 0}; // The number of teams (for x,y,z dimension). uint32_t NumTeams[3] = {0, 0, 0}; // The number of threads (for x,y,z dimension). diff --git a/offload/include/Shared/Environment.h b/offload/include/Shared/Environment.h index 79e45fd8e082..142fba40340e 100644 --- a/offload/include/Shared/Environment.h +++ b/offload/include/Shared/Environment.h @@ -70,10 +70,25 @@ struct KernelEnvironmentTy { DynamicEnvironmentTy *DynamicEnv = nullptr; }; +/// The fallback types for the dynamic cgroup memory. +enum class DynCGroupMemFallbackType : uint8_t { + /// None. Used for indicating that no fallback was triggered. + None = 0, + /// Abort the execution. + Abort = None, + /// Return null pointer. + Null = 1, + /// Allocate from an implementation-defined memory space. 
+ DefaultMem = 2 +}; + struct KernelLaunchEnvironmentTy { + void *ReductionBuffer = nullptr; + void *DynCGroupMemFbPtr = nullptr; uint32_t ReductionCnt = 0; uint32_t ReductionIterCnt = 0; - void *ReductionBuffer = nullptr; + uint32_t DynCGroupMemSize = 0; + DynCGroupMemFallbackType DynCGroupMemFb = DynCGroupMemFallbackType::None; }; #endif // OMPTARGET_SHARED_ENVIRONMENT_H diff --git a/offload/include/device.h b/offload/include/device.h index 4e27943d1dbc..06d21397c737 100644 --- a/offload/include/device.h +++ b/offload/include/device.h @@ -37,6 +37,8 @@ #include "PluginInterface.h" using GenericPluginTy = llvm::omp::target::plugin::GenericPluginTy; +using DeviceInfo = llvm::omp::target::plugin::DeviceInfo; +using InfoTreeNode = llvm::omp::target::plugin::InfoTreeNode; // Forward declarations. struct __tgt_bin_desc; @@ -167,6 +169,20 @@ struct DeviceTy { /// Indicate that there are pending images for this device or not. void setHasPendingImages(bool V) { HasPendingImages = V; } + /// Get information from the device. + template T getInfo(DeviceInfo Info) const { + InfoTreeNode DevInfo = RTL->obtain_device_info(RTLDeviceID); + + auto EntryOpt = DevInfo.get(Info); + if (!EntryOpt) + return 0; + + auto Entry = *EntryOpt; + if (!std::holds_alternative(Entry->Value)) + return T{}; + return std::get(Entry->Value); + } + private: /// Deinitialize the device (and plugin). void deinit(); diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h index 867ba8d5e9f1..40c16a4a7580 100644 --- a/offload/include/omptarget.h +++ b/offload/include/omptarget.h @@ -274,12 +274,23 @@ struct __tgt_target_non_contig { extern "C" { #endif +/// The OpenMP access group type. The criterion for grouping tasks using a +/// specific grouping property. +enum omp_access_t { + /// Groups the tasks based on the contention group to which they belong. + omp_access_cgroup = 0, + /// Groups the tasks based on the parallel region to which they bind. 
+ omp_access_pteam = 1, +}; + void ompx_dump_mapping_tables(void); int omp_get_num_devices(void); int omp_get_device_num(void); int omp_get_device_from_uid(const char *DeviceUid); const char *omp_get_uid_from_device(int DeviceNum); int omp_get_initial_device(void); +size_t omp_get_gprivate_limit(int DeviceNum, + omp_access_t AccessGroup = omp_access_cgroup); void *omp_target_alloc(size_t Size, int DeviceNum); void omp_target_free(void *DevicePtr, int DeviceNum); int omp_target_is_present(const void *Ptr, int DeviceNum); diff --git a/offload/libomptarget/OpenMP/API.cpp b/offload/libomptarget/OpenMP/API.cpp index dddd494fa7aa..6dcd94e48e98 100644 --- a/offload/libomptarget/OpenMP/API.cpp +++ b/offload/libomptarget/OpenMP/API.cpp @@ -138,6 +138,22 @@ EXTERN int omp_get_initial_device(void) { return HostDevice; } +EXTERN size_t omp_get_gprivate_limit(int DeviceNum, omp_access_t AccessGroup) { + TIMESCOPE(); + OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0))); + if (DeviceNum == omp_get_initial_device()) + return 0; + + if (AccessGroup != omp_access_cgroup) + return 0; + + auto DeviceOrErr = PM->getDevice(DeviceNum); + if (!DeviceOrErr) + FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str()); + + return DeviceOrErr->getInfo(DeviceInfo::WORK_GROUP_LOCAL_MEM_SIZE); +} + EXTERN void *omp_target_alloc(size_t Size, int DeviceNum) { TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DeviceNum) + ";size=" + std::to_string(Size)); diff --git a/offload/libomptarget/exports b/offload/libomptarget/exports index fccf57683b5b..1831c43cc5f2 100644 --- a/offload/libomptarget/exports +++ b/offload/libomptarget/exports @@ -43,6 +43,7 @@ VERS1.0 { omp_get_device_from_uid; omp_get_uid_from_device; omp_get_initial_device; + omp_get_gprivate_limit; omp_target_alloc; omp_target_free; omp_target_is_accessible; diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index bfd07b0919d0..37d7c6345f02 100644 --- 
a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -559,6 +559,9 @@ struct AMDGPUKernelTy : public GenericKernelTy { return Err; } + // Set the static block memory size required by the kernel. + StaticBlockMemSize = GroupSize; + // Make sure it is a kernel symbol. if (SymbolType != HSA_SYMBOL_KIND_KERNEL) return Plugin::error(ErrorCode::INVALID_BINARY, @@ -582,8 +585,8 @@ struct AMDGPUKernelTy : public GenericKernelTy { /// Launch the AMDGPU kernel function. Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3], - uint32_t NumBlocks[3], KernelArgsTy &KernelArgs, - KernelLaunchParamsTy LaunchParams, + uint32_t NumBlocks[3], uint32_t DynBlockMemSize, + KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams, AsyncInfoWrapperTy &AsyncInfoWrapper) const override; /// Return maximum block size for maximum occupancy @@ -3220,7 +3223,7 @@ private: KernelArgsTy KernelArgs = {}; uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u}; if (auto Err = AMDGPUKernel.launchImpl( - *this, NumBlocksAndThreads, NumBlocksAndThreads, KernelArgs, + *this, NumBlocksAndThreads, NumBlocksAndThreads, 0, KernelArgs, KernelLaunchParamsTy{}, AsyncInfoWrapper)) return Err; @@ -3755,6 +3758,7 @@ private: Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3], uint32_t NumBlocks[3], + uint32_t DynBlockMemSize, KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams, AsyncInfoWrapperTy &AsyncInfoWrapper) const { @@ -3767,13 +3771,6 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice, if (auto Err = ArgsMemoryManager.allocate(ArgsSize, &AllArgs)) return Err; - // Account for user requested dynamic shared memory. 
- uint32_t GroupSize = getGroupSize(); - if (uint32_t MaxDynCGroupMem = std::max( - KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize())) { - GroupSize += MaxDynCGroupMem; - } - uint64_t StackSize; if (auto Err = GenericDevice.getDeviceStackSize(StackSize)) return Err; @@ -3825,9 +3822,17 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice, KernelArgs.DynCGroupMem); } + // Increase to the requested dynamic memory size for the device if needed. + DynBlockMemSize = + std::max(DynBlockMemSize, GenericDevice.getDynamicMemorySize()); + + // HSA requires the group segment size to include both static and dynamic. + uint32_t TotalBlockMemSize = getStaticBlockMemSize() + DynBlockMemSize; + // Push the kernel launch into the stream. return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks, - GroupSize, StackSize, ArgsMemoryManager); + TotalBlockMemSize, StackSize, + ArgsMemoryManager); } Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice, diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index 83d20c620b96..5ed3b57704da 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -301,6 +301,18 @@ private: } }; +/// Configuration of dynamic block memory needed for launching a kernel. +struct DynBlockMemConfTy { + /// The size of the dynamic block memory buffer. + uint32_t Size = 0; + /// The size of dynamic shared memory natively provided by the device. + uint32_t NativeSize = 0; + /// The fallback that was triggered (if any). + DynCGroupMemFallbackType Fallback = DynCGroupMemFallbackType::None; + /// The fallback pointer if global memory was used as alternative. + void *FallbackPtr = nullptr; +}; + /// Class wrapping a __tgt_device_image and its offload entry table on a /// specific device. 
This class is responsible for storing and managing /// the offload entries for an image on a device. @@ -363,7 +375,7 @@ struct GenericKernelTy { AsyncInfoWrapperTy &AsyncInfoWrapper) const; virtual Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3], uint32_t NumBlocks[3], - KernelArgsTy &KernelArgs, + uint32_t DynBlockMemSize, KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams, AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0; @@ -373,6 +385,9 @@ struct GenericKernelTy { /// Get the kernel name. const char *getName() const { return Name.c_str(); } + /// Get the size of the static per-block memory consumed by the kernel. + uint32_t getStaticBlockMemSize() const { return StaticBlockMemSize; }; + /// Get the kernel image. DeviceImageTy &getImage() const { assert(ImagePtr && "Kernel is not initialized!"); @@ -386,8 +401,10 @@ struct GenericKernelTy { /// Return a device pointer to a new kernel launch environment. Expected - getKernelLaunchEnvironment(GenericDeviceTy &GenericDevice, uint32_t Version, - AsyncInfoWrapperTy &AsyncInfo) const; + getKernelLaunchEnvironment(GenericDeviceTy &GenericDevice, + const KernelArgsTy &KernelArgs, + const DynBlockMemConfTy &DynBlockMemConf, + AsyncInfoWrapperTy &AsyncInfoWrapper) const; /// Indicate whether an execution mode is valid. static bool isValidExecutionMode(OMPTgtExecModeFlags ExecutionMode) { @@ -433,6 +450,12 @@ protected: uint32_t NumBlocks[3]) const; private: + /// Prepare the block memory buffer requested for the kernel and execute the + /// specified fallback if necessary. + Expected prepareBlockMemory(GenericDeviceTy &GenericDevice, + KernelArgsTy &KernelArgs, + uint32_t NumBlocks) const; + /// Prepare the arguments before launching the kernel. KernelLaunchParamsTy prepareArgs(GenericDeviceTy &GenericDevice, void **ArgPtrs, @@ -487,6 +510,9 @@ protected: /// The maximum number of threads which the kernel could leverage. uint32_t MaxNumThreads; + /// The static memory size per block. 
+ uint32_t StaticBlockMemSize = 0; + /// The kernel environment, including execution flags. KernelEnvironmentTy KernelEnvironment; @@ -1498,6 +1524,9 @@ public: /// Query the current state of an asynchronous queue. int32_t query_async(int32_t DeviceId, __tgt_async_info *AsyncInfoPtr); + /// Obtain information about the given device. + InfoTreeNode obtain_device_info(int32_t DeviceId); + /// Prints information about the given devices supported by the plugin. void print_device_info(int32_t DeviceId); diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp index 21ba9db292c4..a9af92826e63 100644 --- a/offload/plugins-nextgen/common/src/PluginInterface.cpp +++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp @@ -435,20 +435,21 @@ Error GenericKernelTy::init(GenericDeviceTy &GenericDevice, Expected GenericKernelTy::getKernelLaunchEnvironment( - GenericDeviceTy &GenericDevice, uint32_t Version, + GenericDeviceTy &GenericDevice, const KernelArgsTy &KernelArgs, + const DynBlockMemConfTy &DynBlockMemConf, AsyncInfoWrapperTy &AsyncInfoWrapper) const { // Ctor/Dtor have no arguments, replaying uses the original kernel launch // environment. Older versions of the compiler do not generate a kernel // launch environment. if (GenericDevice.Plugin.getRecordReplay().isReplaying() || - Version < OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR) + KernelArgs.Version < OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR) return nullptr; - if (!KernelEnvironment.Configuration.ReductionDataSize || - !KernelEnvironment.Configuration.ReductionBufferLength) + if ((!KernelEnvironment.Configuration.ReductionDataSize || + !KernelEnvironment.Configuration.ReductionBufferLength) && + KernelArgs.DynCGroupMem == 0) return reinterpret_cast(~0); - // TODO: Check if the kernel needs a launch environment. 
auto AllocOrErr = GenericDevice.dataAlloc(sizeof(KernelLaunchEnvironmentTy), /*HostPtr=*/nullptr, TargetAllocTy::TARGET_ALLOC_DEVICE); @@ -462,7 +463,14 @@ GenericKernelTy::getKernelLaunchEnvironment( /// async data transfer. auto &LocalKLE = (*AsyncInfoWrapper).KernelLaunchEnvironment; LocalKLE = KernelLaunchEnvironment; - { + + LocalKLE.DynCGroupMemSize = DynBlockMemConf.Size; + LocalKLE.DynCGroupMemFbPtr = DynBlockMemConf.FallbackPtr; + LocalKLE.DynCGroupMemFb = DynBlockMemConf.Fallback; + LocalKLE.ReductionBuffer = nullptr; + + if (KernelEnvironment.Configuration.ReductionDataSize && + KernelEnvironment.Configuration.ReductionBufferLength) { auto AllocOrErr = GenericDevice.dataAlloc( KernelEnvironment.Configuration.ReductionDataSize * KernelEnvironment.Configuration.ReductionBufferLength, @@ -508,14 +516,81 @@ Error GenericKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice, return Plugin::success(); } +Expected +GenericKernelTy::prepareBlockMemory(GenericDeviceTy &GenericDevice, + KernelArgsTy &KernelArgs, + uint32_t NumBlocks) const { + uint32_t MaxBlockMemSize = GenericDevice.getMaxBlockSharedMemSize(); + uint32_t DynBlockMemSize = KernelArgs.DynCGroupMem; + uint32_t TotalBlockMemSize = StaticBlockMemSize + DynBlockMemSize; + uint32_t DynNativeBlockMemSize = DynBlockMemSize; + void *DynFallbackPtr = nullptr; + + // Not enough block memory to cover the static one. Cannot run the kernel. + if (StaticBlockMemSize > MaxBlockMemSize) + return Plugin::error(ErrorCode::INVALID_ARGUMENT, + "Static block memory size exceeds maximum"); + // Not enough block memory to cover the dynamic one, and the fallback is aborting. 
+ if (static_cast( + KernelArgs.Flags.DynCGroupMemFallback) == + DynCGroupMemFallbackType::Abort && + TotalBlockMemSize > MaxBlockMemSize) + return Plugin::error( + ErrorCode::INVALID_ARGUMENT, + "Requested block memory size (static + dynamic) exceeds maximum"); + + DynCGroupMemFallbackType DynFallback = DynCGroupMemFallbackType::None; + if (DynBlockMemSize && TotalBlockMemSize > MaxBlockMemSize) { + // Launch without native dynamic block memory. + DynNativeBlockMemSize = 0; + DynFallback = static_cast( + KernelArgs.Flags.DynCGroupMemFallback); + if (DynFallback != DynCGroupMemFallbackType::DefaultMem) { + // Do not provide any memory as fallback. + DynBlockMemSize = 0; + } else { + // Get global memory as fallback. + auto AllocOrErr = GenericDevice.dataAlloc( + NumBlocks * DynBlockMemSize, + /*HostPtr=*/nullptr, TargetAllocTy::TARGET_ALLOC_DEVICE); + if (!AllocOrErr) + return AllocOrErr.takeError(); + DynFallbackPtr = *AllocOrErr; + } + } + return DynBlockMemConfTy{DynBlockMemSize, DynNativeBlockMemSize, DynFallback, + DynFallbackPtr}; +} + Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs, ptrdiff_t *ArgOffsets, KernelArgsTy &KernelArgs, AsyncInfoWrapperTy &AsyncInfoWrapper) const { llvm::SmallVector Args; llvm::SmallVector Ptrs; + uint32_t NumThreads[3] = {KernelArgs.ThreadLimit[0], + KernelArgs.ThreadLimit[1], + KernelArgs.ThreadLimit[2]}; + uint32_t NumBlocks[3] = {KernelArgs.NumTeams[0], KernelArgs.NumTeams[1], + KernelArgs.NumTeams[2]}; + if (!isBareMode()) { + NumThreads[0] = getNumThreads(GenericDevice, NumThreads); + NumBlocks[0] = getNumBlocks(GenericDevice, NumBlocks, KernelArgs.Tripcount, + NumThreads[0], KernelArgs.ThreadLimit[0] > 0); + } + + auto DynBlockMemConfOrErr = + prepareBlockMemory(GenericDevice, KernelArgs, NumBlocks[0]); + if (!DynBlockMemConfOrErr) + return DynBlockMemConfOrErr.takeError(); + + DynBlockMemConfTy &DynBlockMemConf = *DynBlockMemConfOrErr; + if (DynBlockMemConf.FallbackPtr) + 
AsyncInfoWrapper.freeAllocationAfterSynchronization( + DynBlockMemConf.FallbackPtr); + auto KernelLaunchEnvOrErr = getKernelLaunchEnvironment( - GenericDevice, KernelArgs.Version, AsyncInfoWrapper); + GenericDevice, KernelArgs, DynBlockMemConf, AsyncInfoWrapper); if (!KernelLaunchEnvOrErr) return KernelLaunchEnvOrErr.takeError(); @@ -531,17 +606,6 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs, Args, Ptrs, *KernelLaunchEnvOrErr); } - uint32_t NumThreads[3] = {KernelArgs.ThreadLimit[0], - KernelArgs.ThreadLimit[1], - KernelArgs.ThreadLimit[2]}; - uint32_t NumBlocks[3] = {KernelArgs.NumTeams[0], KernelArgs.NumTeams[1], - KernelArgs.NumTeams[2]}; - if (!isBareMode()) { - NumThreads[0] = getNumThreads(GenericDevice, NumThreads); - NumBlocks[0] = getNumBlocks(GenericDevice, NumBlocks, KernelArgs.Tripcount, - NumThreads[0], KernelArgs.ThreadLimit[0] > 0); - } - // Record the kernel description after we modified the argument count and num // blocks/threads. RecordReplayTy &RecordReplay = GenericDevice.Plugin.getRecordReplay(); @@ -557,8 +621,9 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs, printLaunchInfo(GenericDevice, KernelArgs, NumThreads, NumBlocks)) return Err; - return launchImpl(GenericDevice, NumThreads, NumBlocks, KernelArgs, - LaunchParams, AsyncInfoWrapper); + return launchImpl(GenericDevice, NumThreads, NumBlocks, + DynBlockMemConf.NativeSize, KernelArgs, LaunchParams, + AsyncInfoWrapper); } KernelLaunchParamsTy GenericKernelTy::prepareArgs( @@ -1954,6 +2019,16 @@ int32_t GenericPluginTy::query_async(int32_t DeviceId, return OFFLOAD_SUCCESS; } +InfoTreeNode GenericPluginTy::obtain_device_info(int32_t DeviceId) { + auto InfoOrErr = getDevice(DeviceId).obtainInfo(); + if (auto Err = InfoOrErr.takeError()) { + REPORT() << "Failure to obtain device " << DeviceId + << " info: " << toString(std::move(Err)); + return InfoTreeNode{}; + } + return std::move(*InfoOrErr); +} + void 
GenericPluginTy::print_device_info(int32_t DeviceId) { if (auto Err = getDevice(DeviceId).printInfo()) REPORT() << "Failure to print device " << DeviceId diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h index 7e42c66dddab..fa4f4634ecec 100644 --- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h +++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h @@ -261,6 +261,7 @@ typedef enum CUdevice_attribute_enum { typedef enum CUfunction_attribute_enum { CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0, + CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8, } CUfunction_attribute; diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index c50c70a4456f..c96cf3d89d3d 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -150,14 +150,23 @@ struct CUDAKernelTy : public GenericKernelTy { // The maximum number of threads cannot exceed the maximum of the kernel. MaxNumThreads = std::min(MaxNumThreads, (uint32_t)MaxThreads); + int SharedMemSize; + Res = cuFuncGetAttribute(&SharedMemSize, + CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, Func); + if (auto Err = Plugin::check(Res, "Error in cuFuncGetAttribute: %s")) + return Err; + + // Set the static block memory size required by the kernel. + StaticBlockMemSize = SharedMemSize; + // Retrieve the size of the arguments. return initArgsSize(); } /// Launch the CUDA kernel function. 
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3], - uint32_t NumBlocks[3], KernelArgsTy &KernelArgs, - KernelLaunchParamsTy LaunchParams, + uint32_t NumBlocks[3], uint32_t DynBlockMemSize, + KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams, AsyncInfoWrapperTy &AsyncInfoWrapper) const override; /// Return maximum block size for maximum occupancy @@ -197,7 +206,7 @@ private: CUfunction Func; /// The maximum amount of dynamic shared memory per thread group. By default, /// this is set to 48 KB. - mutable uint32_t MaxDynCGroupMemLimit = 49152; + mutable uint32_t MaxDynBlockMemSize = 49152; /// The size of the kernel arguments. size_t ArgsSize; @@ -1411,7 +1420,7 @@ private: KernelArgsTy KernelArgs = {}; uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u}; if (auto Err = CUDAKernel.launchImpl( - *this, NumBlocksAndThreads, NumBlocksAndThreads, KernelArgs, + *this, NumBlocksAndThreads, NumBlocksAndThreads, 0, KernelArgs, KernelLaunchParamsTy{}, AsyncInfoWrapper)) return Err; @@ -1455,6 +1464,7 @@ private: Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3], uint32_t NumBlocks[3], + uint32_t DynBlockMemSize, KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams, AsyncInfoWrapperTy &AsyncInfoWrapper) const { @@ -1470,9 +1480,6 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice, if (auto Err = CUDADevice.getStream(AsyncInfoWrapper, Stream)) return Err; - uint32_t MaxDynCGroupMem = - std::max(KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize()); - size_t ConfigArgsSize = ArgsSize; void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, LaunchParams.Data, CU_LAUNCH_PARAM_BUFFER_SIZE, @@ -1484,20 +1491,24 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice, if (GenericDevice.getRPCServer()) GenericDevice.Plugin.getRPCServer().Thread->notify(); + // Increase to the requested dynamic memory size for the device if needed. 
+ DynBlockMemSize = + std::max(DynBlockMemSize, GenericDevice.getDynamicMemorySize()); + // In case we require more memory than the current limit. - if (MaxDynCGroupMem >= MaxDynCGroupMemLimit) { + if (DynBlockMemSize >= MaxDynBlockMemSize) { CUresult AttrResult = cuFuncSetAttribute( - Func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, MaxDynCGroupMem); + Func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, DynBlockMemSize); if (auto Err = Plugin::check( AttrResult, "error in cuFuncSetAttribute while setting the memory limits: %s")) return Err; - MaxDynCGroupMemLimit = MaxDynCGroupMem; + MaxDynBlockMemSize = DynBlockMemSize; } CUresult Res = cuLaunchKernel(Func, NumBlocks[0], NumBlocks[1], NumBlocks[2], NumThreads[0], NumThreads[1], NumThreads[2], - MaxDynCGroupMem, Stream, nullptr, Config); + DynBlockMemSize, Stream, nullptr, Config); // Register a callback to indicate when the kernel is complete. if (GenericDevice.getRPCServer()) diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp index df2b6f2c1dba..d1c9af92a9fb 100644 --- a/offload/plugins-nextgen/host/src/rtl.cpp +++ b/offload/plugins-nextgen/host/src/rtl.cpp @@ -97,8 +97,8 @@ struct GenELF64KernelTy : public GenericKernelTy { /// Launch the kernel using the libffi. 
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3], - uint32_t NumBlocks[3], KernelArgsTy &KernelArgs, - KernelLaunchParamsTy LaunchParams, + uint32_t NumBlocks[3], uint32_t DynBlockMemSize, + KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams, AsyncInfoWrapperTy &AsyncInfoWrapper) const override { if (!SupportsFFI) return Plugin::error(ErrorCode::UNSUPPORTED, diff --git a/offload/plugins-nextgen/level_zero/include/L0Kernel.h b/offload/plugins-nextgen/level_zero/include/L0Kernel.h index 1d5a014d9d0a..50cdbd8390a9 100644 --- a/offload/plugins-nextgen/level_zero/include/L0Kernel.h +++ b/offload/plugins-nextgen/level_zero/include/L0Kernel.h @@ -124,8 +124,8 @@ public: Error initImpl(GenericDeviceTy &GenericDevice, DeviceImageTy &Image) override; /// Launch the L0 kernel function. Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3], - uint32_t NumBlocks[3], KernelArgsTy &KernelArgs, - KernelLaunchParamsTy LaunchParams, + uint32_t NumBlocks[3], uint32_t DynBlockMemSize, + KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams, AsyncInfoWrapperTy &AsyncInfoWrapper) const override; Error deinit() { CALL_ZE_RET_ERROR(zeKernelDestroy, zeKernel); diff --git a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp index b608e6ffe793..1bffbbcd2fe9 100644 --- a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp +++ b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp @@ -413,9 +413,13 @@ Error L0KernelTy::setIndirectFlags(L0DeviceTy &l0Device, Error L0KernelTy::launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3], uint32_t NumBlocks[3], - KernelArgsTy &KernelArgs, + uint32_t DynBlockMemSize, KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams, AsyncInfoWrapperTy &AsyncInfoWrapper) const { + if (DynBlockMemSize > 0) + return Plugin::error(ErrorCode::UNSUPPORTED, + "dynamic shared memory is unsupported in L0 plugin"); + auto &l0Device = 
L0DeviceTy::makeL0Device(GenericDevice); __tgt_async_info *AsyncInfo = AsyncInfoWrapper; diff --git a/offload/test/offloading/dyn_groupprivate.cpp b/offload/test/offloading/dyn_groupprivate.cpp new file mode 100644 index 000000000000..fd0c3de0c8c5 --- /dev/null +++ b/offload/test/offloading/dyn_groupprivate.cpp @@ -0,0 +1,199 @@ +// RUN: %libomptarget-compilexx-generic -fopenmp-version=61 +// RUN: %libomptarget-run-generic | %fcheck-generic +// RUN: %libomptarget-compileoptxx-generic -fopenmp-version=61 +// RUN: %libomptarget-run-generic | %fcheck-generic +// REQUIRES: gpu +// UNSUPPORTED: intelgpu + +#include +#include + +#define N 512 + +int main() { + int Result[N], NumThreads; + +// Verify the groupprivate buffer works as expected. +#pragma omp target teams num_teams(1) thread_limit(N) \ + dyn_groupprivate(fallback(abort) : N * sizeof(Result[0])) \ + map(from : Result, NumThreads) + { + int Buffer[N]; +#pragma omp parallel + { + int *DynBuffer = (int *)omp_get_dyn_gprivate_nofb_ptr(); + int TId = omp_get_thread_num(); + if (TId == 0) + NumThreads = omp_get_num_threads(); + Buffer[TId] = 7; + DynBuffer[TId] = 3; +#pragma omp barrier + int WrappedTId = (TId + 37) % NumThreads; + Result[TId] = Buffer[WrappedTId] + DynBuffer[WrappedTId]; + } + } + + if (NumThreads < N / 2 || NumThreads > N) { + printf("Expected number of threads to be in [%i:%i], but got: %i", N / 2, N, + NumThreads); + return -1; + } + + int Failed = 0; + for (int i = 0; i < NumThreads; ++i) { + if (Result[i] != 7 + 3) { + printf("Result[%i] is %i, expected %i\n", i, Result[i], 7 + 3); + ++Failed; + } + } + + // Verify that the routines in the host returns NULL and zero. + if (omp_get_dyn_gprivate_ptr()) + ++Failed; + if (omp_get_dyn_gprivate_nofb_ptr()) + ++Failed; + if (omp_get_dyn_gprivate_size()) + ++Failed; + + size_t MaxSize = omp_get_gprivate_limit(0, omp_access_cgroup); + size_t ExceededSize = MaxSize + 10; + +// Verify that the fallback(default_mem) modifier works. 
+#pragma omp target dyn_groupprivate(fallback(default_mem) : ExceededSize) \ + map(tofrom : Failed) + { + if (!omp_get_dyn_gprivate_ptr(0)) + ++Failed; + if (omp_get_dyn_gprivate_nofb_ptr(0)) + ++Failed; + if (omp_get_dyn_gprivate_ptr(0) == omp_get_dyn_gprivate_nofb_ptr(0)) + ++Failed; + if (!omp_get_dyn_gprivate_size()) + ++Failed; + if (omp_get_dyn_gprivate_size() != ExceededSize) + ++Failed; + if (omp_get_dyn_gprivate_memspace() != omp_default_mem_space) + ++Failed; + } + +// Verify that the fallback(null) modifier works. +#pragma omp target dyn_groupprivate(fallback(null) : ExceededSize) \ + map(tofrom : Failed) + { + if (omp_get_dyn_gprivate_ptr(0)) + ++Failed; + if (omp_get_dyn_gprivate_nofb_ptr(0)) + ++Failed; + if (omp_get_dyn_gprivate_ptr(0) != omp_get_dyn_gprivate_nofb_ptr(0)) + ++Failed; + if (omp_get_dyn_gprivate_size()) + ++Failed; + if (omp_get_dyn_gprivate_memspace() != omp_null_mem_space) + ++Failed; + } + +// Verify that the default modifier is fallback(default_mem). +#pragma omp target dyn_groupprivate(ExceededSize) + { + if (!omp_get_dyn_gprivate_ptr(0)) + ++Failed; + if (omp_get_dyn_gprivate_nofb_ptr(0)) + ++Failed; + if (omp_get_dyn_gprivate_ptr(0) == omp_get_dyn_gprivate_nofb_ptr(0)) + ++Failed; + if (!omp_get_dyn_gprivate_size()) + ++Failed; + if (omp_get_dyn_gprivate_size() != ExceededSize) + ++Failed; + if (omp_get_dyn_gprivate_memspace() != omp_default_mem_space) + ++Failed; + } + +// Verify that the fallback(abort) modifier works. 
+#pragma omp target dyn_groupprivate(fallback(abort) : N) map(tofrom : Failed) + { + if (!omp_get_dyn_gprivate_ptr(0)) + ++Failed; + if (!omp_get_dyn_gprivate_nofb_ptr(0)) + ++Failed; + if (omp_get_dyn_gprivate_ptr(0) != omp_get_dyn_gprivate_nofb_ptr(0)) + ++Failed; + if (omp_get_dyn_gprivate_ptr(5) != omp_get_dyn_gprivate_nofb_ptr(5)) + ++Failed; + if (!omp_get_dyn_gprivate_size()) + ++Failed; + if (omp_get_dyn_gprivate_size() != N) + ++Failed; + if (omp_get_dyn_gprivate_memspace() != omp_cgroup_mem_space) + ++Failed; + } + +// Verify that the fallback(default_mem) does not trigger when not needed. +#pragma omp target dyn_groupprivate(fallback(default_mem) : N) \ + map(tofrom : Failed) + { + if (!omp_get_dyn_gprivate_ptr(0)) + ++Failed; + if (!omp_get_dyn_gprivate_nofb_ptr(0)) + ++Failed; + if (omp_get_dyn_gprivate_ptr(0) != omp_get_dyn_gprivate_nofb_ptr(0)) + ++Failed; + if (!omp_get_dyn_gprivate_size()) + ++Failed; + if (omp_get_dyn_gprivate_size() != N) + ++Failed; + if (omp_get_dyn_gprivate_memspace() != omp_cgroup_mem_space) + ++Failed; + } + +// Verify that the clause works when passing a zero size. +#pragma omp target dyn_groupprivate(fallback(abort) : 0) map(tofrom : Failed) + { + if (omp_get_dyn_gprivate_ptr(0)) + ++Failed; + if (omp_get_dyn_gprivate_nofb_ptr(0)) + ++Failed; + if (omp_get_dyn_gprivate_ptr(0) != omp_get_dyn_gprivate_nofb_ptr(0)) + ++Failed; + if (omp_get_dyn_gprivate_size()) + ++Failed; + if (omp_get_dyn_gprivate_memspace() != omp_null_mem_space) + ++Failed; + } + +// Verify that the clause works when passing a zero size and +// fallback(default_mem). 
+#pragma omp target dyn_groupprivate(fallback(default_mem) : 0) \ + map(tofrom : Failed) + { + if (omp_get_dyn_gprivate_ptr(0)) + ++Failed; + if (omp_get_dyn_gprivate_nofb_ptr(0)) + ++Failed; + if (omp_get_dyn_gprivate_ptr(0) != omp_get_dyn_gprivate_nofb_ptr(0)) + ++Failed; + if (omp_get_dyn_gprivate_size()) + ++Failed; + if (omp_get_dyn_gprivate_memspace() != omp_null_mem_space) + ++Failed; + } + +// Verify that omitting the clause is the same as setting zero size. +#pragma omp target map(tofrom : Failed) + { + if (omp_get_dyn_gprivate_ptr(0)) + ++Failed; + if (omp_get_dyn_gprivate_nofb_ptr(0)) + ++Failed; + if (omp_get_dyn_gprivate_ptr(0) != omp_get_dyn_gprivate_nofb_ptr(0)) + ++Failed; + if (omp_get_dyn_gprivate_size()) + ++Failed; + if (omp_get_dyn_gprivate_memspace() != omp_null_mem_space) + ++Failed; + } + + // CHECK: PASS + if (!Failed) + printf("PASS\n"); +} diff --git a/openmp/device/include/DeviceTypes.h b/openmp/device/include/DeviceTypes.h index fab6dbde5260..57fb945b5a64 100644 --- a/openmp/device/include/DeviceTypes.h +++ b/openmp/device/include/DeviceTypes.h @@ -171,9 +171,29 @@ typedef enum omp_allocator_handle_t { KMP_ALLOCATOR_MAX_HANDLE = ~(0LU) } omp_allocator_handle_t; +typedef enum omp_memspace_handle_t { + omp_null_mem_space = 0, + omp_default_mem_space = 99, + omp_large_cap_mem_space = 1, + omp_const_mem_space = 2, + omp_high_bw_mem_space = 3, + omp_low_lat_mem_space = 4, + omp_cgroup_mem_space = 5, + KMP_MEMSPACE_MAX_HANDLE = ~(0LU) +} omp_memspace_handle_t; + #define __PRAGMA(STR) _Pragma(#STR) #define OMP_PRAGMA(STR) __PRAGMA(omp STR) ///} +/// The OpenMP access group type. The criterion for grouping tasks using a +/// specific grouping property. +enum omp_access_t { + /// Groups the tasks based on the contention group to which they belong. + omp_access_cgroup = 0, + /// Groups the tasks based on the parallel region to which they bind. 
+ omp_access_pteam = 1, +}; + #endif diff --git a/openmp/device/include/Interface.h b/openmp/device/include/Interface.h index 71c3b1fc06d4..6a33ea2432c8 100644 --- a/openmp/device/include/Interface.h +++ b/openmp/device/include/Interface.h @@ -226,7 +226,7 @@ struct KernelEnvironmentTy; int8_t __kmpc_is_spmd_exec_mode(); int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment, - KernelLaunchEnvironmentTy &KernelLaunchEnvironment); + KernelLaunchEnvironmentTy *KernelLaunchEnvironment); void __kmpc_target_deinit(); diff --git a/openmp/device/include/State.h b/openmp/device/include/State.h index 31dc1540d7dd..d3cd3d981e29 100644 --- a/openmp/device/include/State.h +++ b/openmp/device/include/State.h @@ -116,7 +116,7 @@ extern Local ThreadStates; /// Initialize the state machinery. Must be called by all threads. void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment, - KernelLaunchEnvironmentTy &KernelLaunchEnvironment); + KernelLaunchEnvironmentTy *KernelLaunchEnvironment); /// Return the kernel and kernel launch environment associated with the current /// kernel. The former is static and contains compile time information that diff --git a/openmp/device/src/Kernel.cpp b/openmp/device/src/Kernel.cpp index 05af35d242ac..a180df7b982e 100644 --- a/openmp/device/src/Kernel.cpp +++ b/openmp/device/src/Kernel.cpp @@ -35,8 +35,8 @@ enum OMPTgtExecModeFlags : unsigned char { }; static void -inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment, - KernelLaunchEnvironmentTy &KernelLaunchEnvironment) { +initializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment, + KernelLaunchEnvironmentTy *KernelLaunchEnvironment) { // Order is important here. synchronize::init(IsSPMD); mapping::init(IsSPMD); @@ -80,17 +80,17 @@ extern "C" { /// \param Ident Source location identification, can be NULL. 
/// int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment, - KernelLaunchEnvironmentTy &KernelLaunchEnvironment) { + KernelLaunchEnvironmentTy *KernelLaunchEnvironment) { ConfigurationEnvironmentTy &Configuration = KernelEnvironment.Configuration; bool IsSPMD = Configuration.ExecMode & OMP_TGT_EXEC_MODE_SPMD; bool UseGenericStateMachine = Configuration.UseGenericStateMachine; if (IsSPMD) { - inititializeRuntime(/*IsSPMD=*/true, KernelEnvironment, - KernelLaunchEnvironment); + initializeRuntime(/*IsSPMD=*/true, KernelEnvironment, + KernelLaunchEnvironment); synchronize::threadsAligned(atomic::relaxed); } else { - inititializeRuntime(/*IsSPMD=*/false, KernelEnvironment, - KernelLaunchEnvironment); + initializeRuntime(/*IsSPMD=*/false, KernelEnvironment, + KernelLaunchEnvironment); // No need to wait since only the main threads will execute user // code and workers will run into a barrier right away. } diff --git a/openmp/device/src/State.cpp b/openmp/device/src/State.cpp index 985e6b169137..243af1f2cb5e 100644 --- a/openmp/device/src/State.cpp +++ b/openmp/device/src/State.cpp @@ -40,6 +40,10 @@ using namespace ompx; [[clang::loader_uninitialized]] static Local<KernelLaunchEnvironmentTy *> KernelLaunchEnvironmentPtr; +/// The pointer type for dynamic shared memory. This is important to keep +/// the alignment and address space information. +using SharedMemPtrTy = decltype(&DynamicSharedBuffer[0]); + ///} namespace { @@ -138,6 +142,60 @@ void SharedMemorySmartStackTy::pop(void *Ptr, uint64_t Bytes) { memory::freeGlobal(Ptr, "Slow path shared memory deallocation"); } +/// Manager of the dynamic cgroup memory buffer. +struct DynCGroupMemTy { + /// Initialize the manager with the information from the kernel launch + /// environment and the pointer to the native shared memory buffer. + void init(KernelLaunchEnvironmentTy *KLE, SharedMemPtrTy NativePtr) { + // Initialize default values. 
+ NativeOrNullPtr = nullptr; + FallbackPtr = nullptr; + Size = 0; + Fallback = DynCGroupMemFallbackType::None; + if (!KLE) + return; + + // Initialize values using the kernel launch environment. + Size = KLE->DynCGroupMemSize; + Fallback = KLE->DynCGroupMemFb; + if (Size && Fallback == DynCGroupMemFallbackType::None) + NativeOrNullPtr = NativePtr; + if (Fallback == DynCGroupMemFallbackType::DefaultMem) + FallbackPtr = static_cast<unsigned char *>(KLE->DynCGroupMemFbPtr) + + Size * mapping::getBlockIdInKernel(); + } + + /// Get the memory space of the buffer. + omp_memspace_handle_t getMemSpace() const { + if (Size == 0) + return omp_null_mem_space; + if (Fallback == DynCGroupMemFallbackType::None) + return omp_cgroup_mem_space; + return omp_default_mem_space; + } + + /// Get the size of the buffer. + size_t getSize() const { return Size; } + + /// Get the native pointer or null if it was a fallback. + SharedMemPtrTy getNativeOrNullPtr() const { return NativeOrNullPtr; } + + /// Get the native pointer or the fallback pointer. + unsigned char *getNativeOrFallbackPtr() const { + return (Fallback == DynCGroupMemFallbackType::DefaultMem) + ? 
FallbackPtr + : getNativeOrNullPtr(); + } + +private: + SharedMemPtrTy NativeOrNullPtr; + unsigned char *FallbackPtr; + size_t Size; + DynCGroupMemFallbackType Fallback; +}; + +[[clang::loader_uninitialized]] static Local DynCGroupMem; + } // namespace void *memory::getDynamicBuffer() { return DynamicSharedBuffer; } @@ -226,13 +284,18 @@ int returnValIfLevelIsActive(int Level, int Val, int DefaultVal, } // namespace void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment, - KernelLaunchEnvironmentTy &KernelLaunchEnvironment) { + KernelLaunchEnvironmentTy *KLE) { SharedMemorySmartStack.init(IsSPMD); + + if (KLE == reinterpret_cast(~0)) + KLE = nullptr; + if (mapping::isInitialThreadInLevel0(IsSPMD)) { + DynCGroupMem.init(KLE, DynamicSharedBuffer); TeamState.init(IsSPMD); ThreadStates = nullptr; KernelEnvironmentPtr = &KernelEnvironment; - KernelLaunchEnvironmentPtr = &KernelLaunchEnvironment; + KernelLaunchEnvironmentPtr = KLE; } } @@ -416,6 +479,25 @@ int omp_get_team_num() { return mapping::getBlockIdInKernel(); } int omp_get_initial_device(void) { return -1; } int omp_is_initial_device(void) { return 0; } + +void *omp_get_dyn_gprivate_ptr(size_t Offset, omp_access_t) { + return DynCGroupMem.getNativeOrFallbackPtr() + Offset; +} + +void *omp_get_dyn_gprivate_nofb_ptr(size_t Offset, omp_access_t) { + unsigned char *Ptr = DynCGroupMem.getNativeOrNullPtr(); + // Ensure the alignment and address space information is kept. 
+ Ptr = (unsigned char *)__builtin_assume_aligned(Ptr, allocator::ALIGNMENT); + return (SharedMemPtrTy)(Ptr + Offset); +} + +size_t omp_get_dyn_gprivate_size(omp_access_t) { + return DynCGroupMem.getSize(); +} + +omp_memspace_handle_t omp_get_dyn_gprivate_memspace(omp_access_t) { + return DynCGroupMem.getMemSpace(); +} } extern "C" { diff --git a/openmp/runtime/src/dllexports b/openmp/runtime/src/dllexports index 00becd1a657f..8a70f8bc6d20 100644 --- a/openmp/runtime/src/dllexports +++ b/openmp/runtime/src/dllexports @@ -607,6 +607,7 @@ kmp_set_disp_num_buffers 890 llvm_omp_target_shared_mem_space DATA llvm_omp_target_device_mem_space DATA omp_null_mem_space DATA + omp_cgroup_mem_space DATA %ifndef stub # Ordinals between 900 and 999 are reserved diff --git a/openmp/runtime/src/include/omp.h.var b/openmp/runtime/src/include/omp.h.var index e98df731ad88..be309727ba09 100644 --- a/openmp/runtime/src/include/omp.h.var +++ b/openmp/runtime/src/include/omp.h.var @@ -380,6 +380,11 @@ omp_uintptr_t value; } omp_alloctrait_t; + typedef enum { + omp_access_cgroup = 0, + omp_access_pteam = 1 + } omp_access_t; + # if defined(_WIN32) // On Windows cl and icl do not support 64-bit enum, let's use integer then. 
typedef omp_uintptr_t omp_allocator_handle_t; @@ -403,6 +408,7 @@ extern __KMP_IMP omp_memspace_handle_t const omp_const_mem_space; extern __KMP_IMP omp_memspace_handle_t const omp_high_bw_mem_space; extern __KMP_IMP omp_memspace_handle_t const omp_low_lat_mem_space; + extern __KMP_IMP omp_memspace_handle_t const omp_cgroup_mem_space; extern __KMP_IMP omp_memspace_handle_t const llvm_omp_target_host_mem_space; extern __KMP_IMP omp_memspace_handle_t const llvm_omp_target_shared_mem_space; extern __KMP_IMP omp_memspace_handle_t const llvm_omp_target_device_mem_space; @@ -439,6 +445,7 @@ omp_const_mem_space = 2, omp_high_bw_mem_space = 3, omp_low_lat_mem_space = 4, + omp_cgroup_mem_space = 5, llvm_omp_target_host_mem_space = 100, llvm_omp_target_shared_mem_space = 101, llvm_omp_target_device_mem_space = 102, @@ -463,6 +470,11 @@ omp_allocator_handle_t allocator = omp_null_allocator, omp_allocator_handle_t free_allocator = omp_null_allocator); extern void __KAI_KMPC_CONVENTION omp_free(void * ptr, omp_allocator_handle_t a = omp_null_allocator); + extern void *__KAI_KMPC_CONVENTION omp_get_dyn_gprivate_ptr(size_t offset = 0, omp_access_t access_group = omp_access_cgroup); + extern void *__KAI_KMPC_CONVENTION omp_get_dyn_gprivate_nofb_ptr(size_t offset = 0, omp_access_t access_group = omp_access_cgroup); + extern size_t __KAI_KMPC_CONVENTION omp_get_dyn_gprivate_size(omp_access_t access_group = omp_access_cgroup); + extern omp_memspace_handle_t __KAI_KMPC_CONVENTION omp_get_dyn_gprivate_memspace(omp_access_t access_group = omp_access_cgroup); + extern size_t __KAI_KMPC_CONVENTION omp_get_gprivate_limit(int device_num, omp_access_t access_group = omp_access_cgroup); # else extern void *__KAI_KMPC_CONVENTION omp_alloc(size_t size, omp_allocator_handle_t a); extern void *__KAI_KMPC_CONVENTION omp_aligned_alloc(size_t align, size_t size, @@ -473,6 +485,11 @@ extern void *__KAI_KMPC_CONVENTION omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator, 
omp_allocator_handle_t free_allocator); extern void __KAI_KMPC_CONVENTION omp_free(void *ptr, omp_allocator_handle_t a); + extern void *__KAI_KMPC_CONVENTION omp_get_dyn_gprivate_ptr(size_t offset, omp_access_t access_group); + extern void *__KAI_KMPC_CONVENTION omp_get_dyn_gprivate_nofb_ptr(size_t offset, omp_access_t access_group); + extern size_t __KAI_KMPC_CONVENTION omp_get_dyn_gprivate_size(omp_access_t access_group); + extern omp_memspace_handle_t __KAI_KMPC_CONVENTION omp_get_dyn_gprivate_memspace(omp_access_t access_group); + extern size_t __KAI_KMPC_CONVENTION omp_get_gprivate_limit(int device_num, omp_access_t access_group); # endif /* OpenMP TR11 routines to get memory spaces and allocators */ diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h index 36c40abaf1ef..19deaef75415 100644 --- a/openmp/runtime/src/kmp.h +++ b/openmp/runtime/src/kmp.h @@ -1072,6 +1072,7 @@ extern omp_memspace_handle_t const omp_large_cap_mem_space; extern omp_memspace_handle_t const omp_const_mem_space; extern omp_memspace_handle_t const omp_high_bw_mem_space; extern omp_memspace_handle_t const omp_low_lat_mem_space; +extern omp_memspace_handle_t const omp_cgroup_mem_space; extern omp_memspace_handle_t const llvm_omp_target_host_mem_space; extern omp_memspace_handle_t const llvm_omp_target_shared_mem_space; extern omp_memspace_handle_t const llvm_omp_target_device_mem_space; diff --git a/openmp/runtime/src/kmp_csupport.cpp b/openmp/runtime/src/kmp_csupport.cpp index a92fc46374c2..8aa9a9caa924 100644 --- a/openmp/runtime/src/kmp_csupport.cpp +++ b/openmp/runtime/src/kmp_csupport.cpp @@ -4515,6 +4515,20 @@ void omp_free(void *ptr, omp_allocator_handle_t allocator) { } /* end of OpenMP 5.1 Memory Management routines */ +void *omp_get_dyn_gprivate_ptr(size_t offset, omp_access_t access_group) { + return NULL; +} + +void *omp_get_dyn_gprivate_nofb_ptr(size_t offset, omp_access_t access_group) { + return NULL; +} + +size_t omp_get_dyn_gprivate_size(omp_access_t 
access_group) { return 0; } + +omp_memspace_handle_t omp_get_dyn_gprivate_memspace(omp_access_t access_group) { + return omp_null_mem_space; +} + int __kmpc_get_target_offload(void) { if (!__kmp_init_serial) { __kmp_serial_initialize(); diff --git a/openmp/runtime/src/kmp_global.cpp b/openmp/runtime/src/kmp_global.cpp index 6c3b576cab40..c6fdcf824af9 100644 --- a/openmp/runtime/src/kmp_global.cpp +++ b/openmp/runtime/src/kmp_global.cpp @@ -333,6 +333,8 @@ omp_memspace_handle_t const omp_high_bw_mem_space = (omp_memspace_handle_t const)3; omp_memspace_handle_t const omp_low_lat_mem_space = (omp_memspace_handle_t const)4; +omp_memspace_handle_t const omp_cgroup_mem_space = + (omp_memspace_handle_t const)5; omp_memspace_handle_t const llvm_omp_target_host_mem_space = (omp_memspace_handle_t const)100; omp_memspace_handle_t const llvm_omp_target_shared_mem_space = diff --git a/openmp/runtime/src/kmp_stub.cpp b/openmp/runtime/src/kmp_stub.cpp index 06276d1bed1c..4c1e6099574a 100644 --- a/openmp/runtime/src/kmp_stub.cpp +++ b/openmp/runtime/src/kmp_stub.cpp @@ -368,6 +368,8 @@ omp_memspace_handle_t const omp_high_bw_mem_space = (omp_memspace_handle_t const)3; omp_memspace_handle_t const omp_low_lat_mem_space = (omp_memspace_handle_t const)4; +omp_memspace_handle_t const omp_cgroup_mem_space = + (omp_memspace_handle_t const)5; omp_memspace_handle_t const llvm_omp_target_host_mem_space = (omp_memspace_handle_t const)100; omp_memspace_handle_t const llvm_omp_target_shared_mem_space = @@ -454,6 +456,31 @@ void omp_free(void *ptr, omp_allocator_handle_t allocator) { #endif } +void *omp_get_dyn_gprivate_ptr(size_t offset, omp_access_t access_group) { + i; + return NULL; +} + +void *omp_get_dyn_gprivate_nofb_ptr(size_t offset, omp_access_t access_group) { + i; + return NULL; +} + +size_t omp_get_dyn_gprivate_size(omp_access_t access_group) { + i; + return 0; +} + +omp_memspace_handle_t omp_get_dyn_gprivate_memspace(omp_access_t access_group) { + i; + return omp_null_mem_space; 
+} + +size_t omp_get_gprivate_limit(int device_num, omp_access_t access_group) { + i; + return 0; +} + /* OpenMP 5.0 Affinity Format */ void omp_set_affinity_format(char const *format) { i; } size_t omp_get_affinity_format(char *buffer, size_t size) {