From ac71b185c2fd6b23e107a29f630d3a927ab6b3b0 Mon Sep 17 00:00:00 2001 From: Kevin Sala Penades Date: Thu, 12 Mar 2026 21:21:29 -0700 Subject: [PATCH] [offload] Remove LIBOMPTARGET_SHARED_MEMORY_SIZE envar (#186231) This commit removes the `LIBOMPTARGET_SHARED_MEMORY_SIZE` envar and outputs a runtime warning if it is defined. Access to dynamic shared memory should be obtained through the `dyn_groupprivate` clause (OpenMP 6.1) or the launch arguments in liboffload kernel launch. --- offload/libomptarget/OffloadRTL.cpp | 11 ++++ offload/libomptarget/device.cpp | 2 +- offload/plugins-nextgen/amdgpu/src/rtl.cpp | 4 -- .../common/include/PluginInterface.h | 2 - .../common/src/PluginInterface.cpp | 1 - offload/plugins-nextgen/cuda/src/rtl.cpp | 4 -- offload/test/api/omp_dynamic_shared_memory.c | 31 ---------- .../api/omp_dynamic_shared_memory_amdgpu.c | 26 -------- openmp/docs/design/Runtimes.rst | 59 ++++++------------- 9 files changed, 29 insertions(+), 111 deletions(-) delete mode 100644 offload/test/api/omp_dynamic_shared_memory.c delete mode 100644 offload/test/api/omp_dynamic_shared_memory_amdgpu.c diff --git a/offload/libomptarget/OffloadRTL.cpp b/offload/libomptarget/OffloadRTL.cpp index 3a18d76aaae1..9b02376609ce 100644 --- a/offload/libomptarget/OffloadRTL.cpp +++ b/offload/libomptarget/OffloadRTL.cpp @@ -26,11 +26,22 @@ static uint32_t RefCount = 0; std::atomic RTLAlive{false}; std::atomic RTLOngoingSyncs{0}; +/// Check deleted and deprecated features, such as environment variables. +static void checkRuntimeEnvironment() { + const char *ShmemEnvarName = "LIBOMPTARGET_SHARED_MEMORY_SIZE"; + if (std::getenv(ShmemEnvarName)) + MESSAGE("Warning: %s is no longer valid. Please use OpenMP clause " + "'dyn_groupprivate' instead.\n", + ShmemEnvarName); +} + void initRuntime() { std::scoped_lock Lock(PluginMtx); Profiler::get(); TIMESCOPE(); + checkRuntimeEnvironment(); + if (PM == nullptr) PM = new PluginManager(); diff --git a/offload/libomptarget/device.cpp b/offload/libomptarget/device.cpp index 659ef689f67e..12c15aea1ad6 100644 --- a/offload/libomptarget/device.cpp +++ b/offload/libomptarget/device.cpp @@ -211,7 +211,7 @@ DeviceTy::loadBinary(__tgt_device_image *Img) { DeviceEnvironment.NumDevices = RTL->getNumDevices(); // TODO: The device ID used here is not the real device ID used by OpenMP. DeviceEnvironment.DeviceNum = RTLDeviceID; - DeviceEnvironment.DynamicMemSize = GenericDevice.getDynamicMemorySize(); + DeviceEnvironment.DynamicMemSize = 0; DeviceEnvironment.ClockFrequency = GenericDevice.getClockFrequency(); DeviceEnvironment.IndirectCallTable = reinterpret_cast(CallTablePairOrErr->first); diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index 37d7c6345f02..b25d3c9ab721 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -3822,10 +3822,6 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice, KernelArgs.DynCGroupMem); } - // Increase to the requested dynamic memory size for the device if needed. - DynBlockMemSize = - std::max(DynBlockMemSize, GenericDevice.getDynamicMemorySize()); - // HSA requires the group segment size to include both static and dynamic. uint32_t TotalBlockMemSize = getStaticBlockMemSize() + DynBlockMemSize; diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index bd03756a07de..b6a54f05b1dc 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -1023,7 +1023,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy { return GridValues.GV_Default_Num_Teams; } uint32_t getDebugKind() const { return OMPX_DebugKind; } - uint32_t getDynamicMemorySize() const { return OMPX_SharedMemorySize; } virtual uint64_t getClockFrequency() const { return CLOCKS_PER_SEC; } /// Get target compute unit kind (e.g., sm_80, or gfx908). @@ -1196,7 +1195,6 @@ private: /// Environment variables defined by the LLVM OpenMP implementation. Int32Envar OMPX_DebugKind; - UInt32Envar OMPX_SharedMemorySize; UInt64Envar OMPX_TargetStackSize; UInt64Envar OMPX_TargetHeapSize; diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp index 1d2f5be02c98..4093d08044bc 100644 --- a/offload/plugins-nextgen/common/src/PluginInterface.cpp +++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp @@ -776,7 +776,6 @@ GenericDeviceTy::GenericDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, OMP_NumTeams("OMP_NUM_TEAMS"), OMP_TeamsThreadLimit("OMP_TEAMS_THREAD_LIMIT"), OMPX_DebugKind("LIBOMPTARGET_DEVICE_RTL_DEBUG"), - OMPX_SharedMemorySize("LIBOMPTARGET_SHARED_MEMORY_SIZE"), // Do not initialize the following two envars since they depend on the // device initialization. These cannot be consulted until the device is // initialized correctly. We initialize them in GenericDeviceTy::init(). diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index c96cf3d89d3d..4de754265ea7 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -1491,10 +1491,6 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice, if (GenericDevice.getRPCServer()) GenericDevice.Plugin.getRPCServer().Thread->notify(); - // Increase to the requested dynamic memory size for the device if needed. - DynBlockMemSize = - std::max(DynBlockMemSize, GenericDevice.getDynamicMemorySize()); - // In case we require more memory than the current limit. if (DynBlockMemSize >= MaxDynBlockMemSize) { CUresult AttrResult = cuFuncSetAttribute( diff --git a/offload/test/api/omp_dynamic_shared_memory.c b/offload/test/api/omp_dynamic_shared_memory.c deleted file mode 100644 index e12b97d34edb..000000000000 --- a/offload/test/api/omp_dynamic_shared_memory.c +++ /dev/null @@ -1,31 +0,0 @@ -// RUN: %libomptarget-compile-generic -// RUN: env LIBOMPTARGET_SHARED_MEMORY_SIZE=256 \ -// RUN: %libomptarget-run-generic | %fcheck-generic - -// RUN: %libomptarget-compileopt-generic -// RUN: env LIBOMPTARGET_SHARED_MEMORY_SIZE=256 \ -// RUN: %libomptarget-run-generic | %fcheck-generic - -// REQUIRES: gpu -// XFAIL: intelgpu - -#include -#include - -int main() { - int x; -#pragma omp target parallel map(from : x) - { - int *buf = llvm_omp_target_dynamic_shared_alloc() + 252; -#pragma omp barrier - if (omp_get_thread_num() == 0) - *buf = 1; -#pragma omp barrier - if (omp_get_thread_num() == 1) - x = *buf; - } - - // CHECK: PASS - if (x == 1 && llvm_omp_target_dynamic_shared_alloc() == NULL) - printf("PASS\n"); -} diff --git a/offload/test/api/omp_dynamic_shared_memory_amdgpu.c b/offload/test/api/omp_dynamic_shared_memory_amdgpu.c deleted file mode 100644 index 1aaec06659a7..000000000000 --- a/offload/test/api/omp_dynamic_shared_memory_amdgpu.c +++ /dev/null @@ -1,26 +0,0 @@ -// RUN: %libomptarget-compile-amdgcn-amd-amdhsa -O2 -mllvm \ -// RUN: -openmp-opt-inline-device -// RUN: env LIBOMPTARGET_SHARED_MEMORY_SIZE=256 \ -// RUN: %libomptarget-run-amdgcn-amd-amdhsa | %fcheck-amdgcn-amd-amdhsa -// REQUIRES: amdgcn-amd-amdhsa - -#include -#include - -int main() { - int x; -#pragma omp target parallel map(from : x) - { - int *buf = llvm_omp_target_dynamic_shared_alloc() + 252; -#pragma omp barrier - if (omp_get_thread_num() == 0) - *buf = 1; -#pragma omp barrier - if (omp_get_thread_num() == 1) - x = *buf; - } - - // CHECK: PASS - if (x == 1 && llvm_omp_target_dynamic_shared_alloc() == NULL) - printf("PASS\n"); -} diff --git a/openmp/docs/design/Runtimes.rst b/openmp/docs/design/Runtimes.rst index d46ec5ba5293..ab9484f9ad0a 100644 --- a/openmp/docs/design/Runtimes.rst +++ b/openmp/docs/design/Runtimes.rst @@ -733,7 +733,6 @@ variables is defined below. * ``LIBOMPTARGET_INFO=`` * ``LIBOMPTARGET_HEAP_SIZE=`` * ``LIBOMPTARGET_STACK_SIZE=`` - * ``LIBOMPTARGET_SHARED_MEMORY_SIZE=`` * ``LIBOMPTARGET_MAP_FORCE_ATOMIC=[TRUE/FALSE] (default TRUE)`` * ``LIBOMPTARGET_TREAT_ATTACH_AUTO_AS_ALWAYS=[TRUE/FALSE] (default FALSE)`` * ``LIBOMPTARGET_JIT_OPT_LEVEL={0,1,2,3} (default 3)`` @@ -1059,14 +1058,6 @@ allocated using ``malloc`` and ``free`` for the CUDA plugin. This is necessary for some applications that allocate too much memory either through the user or globalization. -LIBOMPTARGET_SHARED_MEMORY_SIZE -""""""""""""""""""""""""""""""" - -This environment variable sets the amount of dynamic shared memory in bytes used -by the kernel once it is launched. A pointer to the dynamic memory buffer can be -accessed using the ``llvm_omp_target_dynamic_shared_alloc`` function. An example -is shown in :ref:`libomptarget_dynamic_shared`. - .. toctree:: :hidden: :maxdepth: 1 @@ -1233,7 +1224,6 @@ Environment Variables There are several environment variables to change the behavior of the plugins: -* ``LIBOMPTARGET_SHARED_MEMORY_SIZE`` * ``LIBOMPTARGET_STACK_SIZE`` * ``LIBOMPTARGET_HEAP_SIZE`` * ``LIBOMPTARGET_NUM_INITIAL_STREAMS`` @@ -1247,8 +1237,8 @@ There are several environment variables to change the behavior of the plugins: * ``LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS`` * ``LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT`` -The environment variables ``LIBOMPTARGET_SHARED_MEMORY_SIZE``, -``LIBOMPTARGET_STACK_SIZE`` and ``LIBOMPTARGET_HEAP_SIZE`` are described in +The environment variables ``LIBOMPTARGET_STACK_SIZE`` and +``LIBOMPTARGET_HEAP_SIZE`` are described in :ref:`libopenmptarget_environment_vars`. LIBOMPTARGET_NUM_INITIAL_STREAMS @@ -1401,6 +1391,10 @@ LIBOMPTARGET_RPC_LATENCY """""""""""""""""""""""" This is the maximum amount of time the client will wait for a response from the server. +.. warning:: + The ``LIBOMPTARGET_SHARED_MEMORY_SIZE`` environment variable is not + supported anymore. Please use the ``dyn_groupprivate`` clause instead, as + shown in :ref:`libomptarget_dynamic_shared`. .. _libomptarget_libc: @@ -1463,35 +1457,21 @@ IR during compilation. Dynamic Shared Memory ^^^^^^^^^^^^^^^^^^^^^ -The target device runtime contains a pointer to the dynamic shared memory -buffer. This pointer can be obtained using the +The OpenMP implementation provides access to dynamic shared memory in ``target`` +regions through the ``dyn_groupprivate`` clause, introduced in OpenMP 6.1. This +is the preferred method to obtain dynamic shared memory. Please refer to +the OpenMP standard documentation for more information. + +As an alternative, the target device runtime contains a pointer to the native +dynamic shared memory buffer. This pointer can be obtained using the ``llvm_omp_target_dynamic_shared_alloc`` extension. If this function is called from the host it will simply return a null pointer. In order to use this buffer the kernel must be launched with an adequate amount of dynamic shared memory -allocated. This can be done using the ``LIBOMPTARGET_SHARED_MEMORY_SIZE`` -environment variable or the ``ompx_dyn_cgroup_mem()`` target directive -clause. Examples for both are given below. +allocated. This can be done using the ``ompx_dyn_cgroup_mem()`` target +directive clause. An example is given below. -.. code-block:: c++ - - void foo() { - int x; - #pragma omp target parallel map(from : x) - { - int *buf = llvm_omp_target_dynamic_shared_alloc(); - if (omp_get_thread_num() == 0) - *buf = 1; - #pragma omp barrier - if (omp_get_thread_num() == 1) - x = *buf; - } - assert(x == 1); - } - -.. code-block:: console - - $ clang++ -fopenmp --offload-arch=sm_80 -O3 shared.c - $ env LIBOMPTARGET_SHARED_MEMORY_SIZE=256 ./shared +Please notice that the ``LIBOMPTARGET_SHARED_MEMORY_SIZE`` environment variable +is not supported anymore. .. code-block:: c++ @@ -1509,11 +1489,6 @@ clause. Examples for both are given below. assert(x == 1); } -.. code-block:: console - - $ clang++ -fopenmp --offload-arch=gfx90a -O3 shared.c - $ env ./shared - .. _libomptarget_device_allocator: Device Allocation