[offload] Remove LIBOMPTARGET_SHARED_MEMORY_SIZE envar (#186231)

This commit removes the `LIBOMPTARGET_SHARED_MEMORY_SIZE` envar and
outputs a runtime warning if it is defined. Access to dynamic shared memory
should be obtained through the `dyn_groupprivate` clause (OpenMP 6.1) or
the launch arguments in liboffload kernel launch.
This commit is contained in:
Kevin Sala Penades
2026-03-12 21:21:29 -07:00
committed by GitHub
parent e51e9afe68
commit ac71b185c2
9 changed files with 29 additions and 111 deletions

View File

@@ -26,11 +26,22 @@ static uint32_t RefCount = 0;
std::atomic<bool> RTLAlive{false}; std::atomic<bool> RTLAlive{false};
std::atomic<int> RTLOngoingSyncs{0}; std::atomic<int> RTLOngoingSyncs{0};
/// Check deleted and deprecated features, such as environment variables.
static void checkRuntimeEnvironment() {
const char *ShmemEnvarName = "LIBOMPTARGET_SHARED_MEMORY_SIZE";
if (std::getenv(ShmemEnvarName))
MESSAGE("Warning: %s is no longer valid. Please use OpenMP clause "
"'dyn_groupprivate' instead.\n",
ShmemEnvarName);
}
void initRuntime() { void initRuntime() {
std::scoped_lock<decltype(PluginMtx)> Lock(PluginMtx); std::scoped_lock<decltype(PluginMtx)> Lock(PluginMtx);
Profiler::get(); Profiler::get();
TIMESCOPE(); TIMESCOPE();
checkRuntimeEnvironment();
if (PM == nullptr) if (PM == nullptr)
PM = new PluginManager(); PM = new PluginManager();

View File

@@ -211,7 +211,7 @@ DeviceTy::loadBinary(__tgt_device_image *Img) {
DeviceEnvironment.NumDevices = RTL->getNumDevices(); DeviceEnvironment.NumDevices = RTL->getNumDevices();
// TODO: The device ID used here is not the real device ID used by OpenMP. // TODO: The device ID used here is not the real device ID used by OpenMP.
DeviceEnvironment.DeviceNum = RTLDeviceID; DeviceEnvironment.DeviceNum = RTLDeviceID;
DeviceEnvironment.DynamicMemSize = GenericDevice.getDynamicMemorySize(); DeviceEnvironment.DynamicMemSize = 0;
DeviceEnvironment.ClockFrequency = GenericDevice.getClockFrequency(); DeviceEnvironment.ClockFrequency = GenericDevice.getClockFrequency();
DeviceEnvironment.IndirectCallTable = DeviceEnvironment.IndirectCallTable =
reinterpret_cast<uintptr_t>(CallTablePairOrErr->first); reinterpret_cast<uintptr_t>(CallTablePairOrErr->first);

View File

@@ -3822,10 +3822,6 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
KernelArgs.DynCGroupMem); KernelArgs.DynCGroupMem);
} }
// Increase to the requested dynamic memory size for the device if needed.
DynBlockMemSize =
std::max(DynBlockMemSize, GenericDevice.getDynamicMemorySize());
// HSA requires the group segment size to include both static and dynamic. // HSA requires the group segment size to include both static and dynamic.
uint32_t TotalBlockMemSize = getStaticBlockMemSize() + DynBlockMemSize; uint32_t TotalBlockMemSize = getStaticBlockMemSize() + DynBlockMemSize;

View File

@@ -1023,7 +1023,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
return GridValues.GV_Default_Num_Teams; return GridValues.GV_Default_Num_Teams;
} }
uint32_t getDebugKind() const { return OMPX_DebugKind; } uint32_t getDebugKind() const { return OMPX_DebugKind; }
uint32_t getDynamicMemorySize() const { return OMPX_SharedMemorySize; }
virtual uint64_t getClockFrequency() const { return CLOCKS_PER_SEC; } virtual uint64_t getClockFrequency() const { return CLOCKS_PER_SEC; }
/// Get target compute unit kind (e.g., sm_80, or gfx908). /// Get target compute unit kind (e.g., sm_80, or gfx908).
@@ -1196,7 +1195,6 @@ private:
/// Environment variables defined by the LLVM OpenMP implementation. /// Environment variables defined by the LLVM OpenMP implementation.
Int32Envar OMPX_DebugKind; Int32Envar OMPX_DebugKind;
UInt32Envar OMPX_SharedMemorySize;
UInt64Envar OMPX_TargetStackSize; UInt64Envar OMPX_TargetStackSize;
UInt64Envar OMPX_TargetHeapSize; UInt64Envar OMPX_TargetHeapSize;

View File

@@ -776,7 +776,6 @@ GenericDeviceTy::GenericDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId,
OMP_NumTeams("OMP_NUM_TEAMS"), OMP_NumTeams("OMP_NUM_TEAMS"),
OMP_TeamsThreadLimit("OMP_TEAMS_THREAD_LIMIT"), OMP_TeamsThreadLimit("OMP_TEAMS_THREAD_LIMIT"),
OMPX_DebugKind("LIBOMPTARGET_DEVICE_RTL_DEBUG"), OMPX_DebugKind("LIBOMPTARGET_DEVICE_RTL_DEBUG"),
OMPX_SharedMemorySize("LIBOMPTARGET_SHARED_MEMORY_SIZE"),
// Do not initialize the following two envars since they depend on the // Do not initialize the following two envars since they depend on the
// device initialization. These cannot be consulted until the device is // device initialization. These cannot be consulted until the device is
// initialized correctly. We initialize them in GenericDeviceTy::init(). // initialized correctly. We initialize them in GenericDeviceTy::init().

View File

@@ -1491,10 +1491,6 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
if (GenericDevice.getRPCServer()) if (GenericDevice.getRPCServer())
GenericDevice.Plugin.getRPCServer().Thread->notify(); GenericDevice.Plugin.getRPCServer().Thread->notify();
// Increase to the requested dynamic memory size for the device if needed.
DynBlockMemSize =
std::max(DynBlockMemSize, GenericDevice.getDynamicMemorySize());
// In case we require more memory than the current limit. // In case we require more memory than the current limit.
if (DynBlockMemSize >= MaxDynBlockMemSize) { if (DynBlockMemSize >= MaxDynBlockMemSize) {
CUresult AttrResult = cuFuncSetAttribute( CUresult AttrResult = cuFuncSetAttribute(

View File

@@ -1,31 +0,0 @@
// RUN: %libomptarget-compile-generic
// RUN: env LIBOMPTARGET_SHARED_MEMORY_SIZE=256 \
// RUN: %libomptarget-run-generic | %fcheck-generic
// RUN: %libomptarget-compileopt-generic
// RUN: env LIBOMPTARGET_SHARED_MEMORY_SIZE=256 \
// RUN: %libomptarget-run-generic | %fcheck-generic
// REQUIRES: gpu
// XFAIL: intelgpu
#include <omp.h>
#include <stdio.h>
int main() {
int x;
#pragma omp target parallel map(from : x)
{
int *buf = llvm_omp_target_dynamic_shared_alloc() + 252;
#pragma omp barrier
if (omp_get_thread_num() == 0)
*buf = 1;
#pragma omp barrier
if (omp_get_thread_num() == 1)
x = *buf;
}
// CHECK: PASS
if (x == 1 && llvm_omp_target_dynamic_shared_alloc() == NULL)
printf("PASS\n");
}

View File

@@ -1,26 +0,0 @@
// RUN: %libomptarget-compile-amdgcn-amd-amdhsa -O2 -mllvm \
// RUN: -openmp-opt-inline-device
// RUN: env LIBOMPTARGET_SHARED_MEMORY_SIZE=256 \
// RUN: %libomptarget-run-amdgcn-amd-amdhsa | %fcheck-amdgcn-amd-amdhsa
// REQUIRES: amdgcn-amd-amdhsa
#include <omp.h>
#include <stdio.h>
int main() {
int x;
#pragma omp target parallel map(from : x)
{
int *buf = llvm_omp_target_dynamic_shared_alloc() + 252;
#pragma omp barrier
if (omp_get_thread_num() == 0)
*buf = 1;
#pragma omp barrier
if (omp_get_thread_num() == 1)
x = *buf;
}
// CHECK: PASS
if (x == 1 && llvm_omp_target_dynamic_shared_alloc() == NULL)
printf("PASS\n");
}

View File

@@ -733,7 +733,6 @@ variables is defined below.
* ``LIBOMPTARGET_INFO=<Num>`` * ``LIBOMPTARGET_INFO=<Num>``
* ``LIBOMPTARGET_HEAP_SIZE=<Num>`` * ``LIBOMPTARGET_HEAP_SIZE=<Num>``
* ``LIBOMPTARGET_STACK_SIZE=<Num>`` * ``LIBOMPTARGET_STACK_SIZE=<Num>``
* ``LIBOMPTARGET_SHARED_MEMORY_SIZE=<Num>``
* ``LIBOMPTARGET_MAP_FORCE_ATOMIC=[TRUE/FALSE] (default TRUE)`` * ``LIBOMPTARGET_MAP_FORCE_ATOMIC=[TRUE/FALSE] (default TRUE)``
* ``LIBOMPTARGET_TREAT_ATTACH_AUTO_AS_ALWAYS=[TRUE/FALSE] (default FALSE)`` * ``LIBOMPTARGET_TREAT_ATTACH_AUTO_AS_ALWAYS=[TRUE/FALSE] (default FALSE)``
* ``LIBOMPTARGET_JIT_OPT_LEVEL={0,1,2,3} (default 3)`` * ``LIBOMPTARGET_JIT_OPT_LEVEL={0,1,2,3} (default 3)``
@@ -1059,14 +1058,6 @@ allocated using ``malloc`` and ``free`` for the CUDA plugin. This is necessary
for some applications that allocate too much memory either through the user or for some applications that allocate too much memory either through the user or
globalization. globalization.
LIBOMPTARGET_SHARED_MEMORY_SIZE
"""""""""""""""""""""""""""""""
This environment variable sets the amount of dynamic shared memory in bytes used
by the kernel once it is launched. A pointer to the dynamic memory buffer can be
accessed using the ``llvm_omp_target_dynamic_shared_alloc`` function. An example
is shown in :ref:`libomptarget_dynamic_shared`.
.. toctree:: .. toctree::
:hidden: :hidden:
:maxdepth: 1 :maxdepth: 1
@@ -1233,7 +1224,6 @@ Environment Variables
There are several environment variables to change the behavior of the plugins: There are several environment variables to change the behavior of the plugins:
* ``LIBOMPTARGET_SHARED_MEMORY_SIZE``
* ``LIBOMPTARGET_STACK_SIZE`` * ``LIBOMPTARGET_STACK_SIZE``
* ``LIBOMPTARGET_HEAP_SIZE`` * ``LIBOMPTARGET_HEAP_SIZE``
* ``LIBOMPTARGET_NUM_INITIAL_STREAMS`` * ``LIBOMPTARGET_NUM_INITIAL_STREAMS``
@@ -1247,8 +1237,8 @@ There are several environment variables to change the behavior of the plugins:
* ``LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS`` * ``LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS``
* ``LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT`` * ``LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT``
The environment variables ``LIBOMPTARGET_SHARED_MEMORY_SIZE``, The environment variables ``LIBOMPTARGET_STACK_SIZE`` and
``LIBOMPTARGET_STACK_SIZE`` and ``LIBOMPTARGET_HEAP_SIZE`` are described in ``LIBOMPTARGET_HEAP_SIZE`` are described in
:ref:`libopenmptarget_environment_vars`. :ref:`libopenmptarget_environment_vars`.
LIBOMPTARGET_NUM_INITIAL_STREAMS LIBOMPTARGET_NUM_INITIAL_STREAMS
@@ -1401,6 +1391,10 @@ LIBOMPTARGET_RPC_LATENCY
"""""""""""""""""""""""" """"""""""""""""""""""""
This is the maximum amount of time the client will wait for a response from the server. This is the maximum amount of time the client will wait for a response from the server.
.. warning::
The ``LIBOMPTARGET_SHARED_MEMORY_SIZE`` environment variable is not
supported anymore. Please use the ``dyn_groupprivate`` clause instead, as
shown in :ref:`libomptarget_dynamic_shared`.
.. _libomptarget_libc: .. _libomptarget_libc:
@@ -1463,35 +1457,21 @@ IR during compilation.
Dynamic Shared Memory Dynamic Shared Memory
^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^
The target device runtime contains a pointer to the dynamic shared memory The OpenMP implementation provides access to dynamic shared memory in ``target``
buffer. This pointer can be obtained using the regions through the ``dyn_groupprivate`` clause, introduced in OpenMP 6.1. This
is the preferred method to obtain dynamic shared memory. Please refer to
the OpenMP standard documentation for more information.
As an alternative, the target device runtime contains a pointer to the native
dynamic shared memory buffer. This pointer can be obtained using the
``llvm_omp_target_dynamic_shared_alloc`` extension. If this function is called ``llvm_omp_target_dynamic_shared_alloc`` extension. If this function is called
from the host it will simply return a null pointer. In order to use this buffer from the host it will simply return a null pointer. In order to use this buffer
the kernel must be launched with an adequate amount of dynamic shared memory the kernel must be launched with an adequate amount of dynamic shared memory
allocated. This can be done using the ``LIBOMPTARGET_SHARED_MEMORY_SIZE`` allocated. This can be done using the ``ompx_dyn_cgroup_mem(<N>)`` target
environment variable or the ``ompx_dyn_cgroup_mem(<N>)`` target directive directive clause. An example is given below.
clause. Examples for both are given below.
.. code-block:: c++ Please notice that the ``LIBOMPTARGET_SHARED_MEMORY_SIZE`` environment variable
is not supported anymore.
void foo() {
int x;
#pragma omp target parallel map(from : x)
{
int *buf = llvm_omp_target_dynamic_shared_alloc();
if (omp_get_thread_num() == 0)
*buf = 1;
#pragma omp barrier
if (omp_get_thread_num() == 1)
x = *buf;
}
assert(x == 1);
}
.. code-block:: console
$ clang++ -fopenmp --offload-arch=sm_80 -O3 shared.c
$ env LIBOMPTARGET_SHARED_MEMORY_SIZE=256 ./shared
.. code-block:: c++ .. code-block:: c++
@@ -1509,11 +1489,6 @@ clause. Examples for both are given below.
assert(x == 1); assert(x == 1);
} }
.. code-block:: console
$ clang++ -fopenmp --offload-arch=gfx90a -O3 shared.c
$ env ./shared
.. _libomptarget_device_allocator: .. _libomptarget_device_allocator:
Device Allocation Device Allocation