[offload] Remove LIBOMPTARGET_SHARED_MEMORY_SIZE envar (#186231)

This commit removes the `LIBOMPTARGET_SHARED_MEMORY_SIZE` envar and
outputs a runtime warning if it is defined. Access to dynamic shared memory
should be obtained through the `dyn_groupprivate` clause (OpenMP 6.1) or
the launch arguments in liboffload kernel launch.
This commit is contained in:
Kevin Sala Penades
2026-03-12 21:21:29 -07:00
committed by GitHub
parent e51e9afe68
commit ac71b185c2
9 changed files with 29 additions and 111 deletions

View File

@@ -26,11 +26,22 @@ static uint32_t RefCount = 0;
std::atomic<bool> RTLAlive{false};
std::atomic<int> RTLOngoingSyncs{0};
/// Inspect the process environment for deleted or deprecated features and
/// report them. Currently this only covers the removed
/// LIBOMPTARGET_SHARED_MEMORY_SIZE environment variable.
static void checkRuntimeEnvironment() {
  const char *const RemovedShmemEnvar = "LIBOMPTARGET_SHARED_MEMORY_SIZE";

  // Nothing to report unless the removed variable is still defined.
  if (std::getenv(RemovedShmemEnvar) == nullptr)
    return;

  MESSAGE("Warning: %s is no longer valid. Please use OpenMP clause "
          "'dyn_groupprivate' instead.\n",
          RemovedShmemEnvar);
}
void initRuntime() {
std::scoped_lock<decltype(PluginMtx)> Lock(PluginMtx);
Profiler::get();
TIMESCOPE();
checkRuntimeEnvironment();
if (PM == nullptr)
PM = new PluginManager();

View File

@@ -211,7 +211,7 @@ DeviceTy::loadBinary(__tgt_device_image *Img) {
DeviceEnvironment.NumDevices = RTL->getNumDevices();
// TODO: The device ID used here is not the real device ID used by OpenMP.
DeviceEnvironment.DeviceNum = RTLDeviceID;
DeviceEnvironment.DynamicMemSize = GenericDevice.getDynamicMemorySize();
DeviceEnvironment.DynamicMemSize = 0;
DeviceEnvironment.ClockFrequency = GenericDevice.getClockFrequency();
DeviceEnvironment.IndirectCallTable =
reinterpret_cast<uintptr_t>(CallTablePairOrErr->first);

View File

@@ -3822,10 +3822,6 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
KernelArgs.DynCGroupMem);
}
// Increase to the requested dynamic memory size for the device if needed.
DynBlockMemSize =
std::max(DynBlockMemSize, GenericDevice.getDynamicMemorySize());
// HSA requires the group segment size to include both static and dynamic.
uint32_t TotalBlockMemSize = getStaticBlockMemSize() + DynBlockMemSize;

View File

@@ -1023,7 +1023,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
return GridValues.GV_Default_Num_Teams;
}
uint32_t getDebugKind() const { return OMPX_DebugKind; }
uint32_t getDynamicMemorySize() const { return OMPX_SharedMemorySize; }
virtual uint64_t getClockFrequency() const { return CLOCKS_PER_SEC; }
/// Get target compute unit kind (e.g., sm_80, or gfx908).
@@ -1196,7 +1195,6 @@ private:
/// Environment variables defined by the LLVM OpenMP implementation.
Int32Envar OMPX_DebugKind;
UInt32Envar OMPX_SharedMemorySize;
UInt64Envar OMPX_TargetStackSize;
UInt64Envar OMPX_TargetHeapSize;

View File

@@ -776,7 +776,6 @@ GenericDeviceTy::GenericDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId,
OMP_NumTeams("OMP_NUM_TEAMS"),
OMP_TeamsThreadLimit("OMP_TEAMS_THREAD_LIMIT"),
OMPX_DebugKind("LIBOMPTARGET_DEVICE_RTL_DEBUG"),
OMPX_SharedMemorySize("LIBOMPTARGET_SHARED_MEMORY_SIZE"),
// Do not initialize the following two envars since they depend on the
// device initialization. These cannot be consulted until the device is
// initialized correctly. We initialize them in GenericDeviceTy::init().

View File

@@ -1491,10 +1491,6 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
if (GenericDevice.getRPCServer())
GenericDevice.Plugin.getRPCServer().Thread->notify();
// Increase to the requested dynamic memory size for the device if needed.
DynBlockMemSize =
std::max(DynBlockMemSize, GenericDevice.getDynamicMemorySize());
// In case we require more memory than the current limit.
if (DynBlockMemSize >= MaxDynBlockMemSize) {
CUresult AttrResult = cuFuncSetAttribute(

View File

@@ -1,31 +0,0 @@
// RUN: %libomptarget-compile-generic
// RUN: env LIBOMPTARGET_SHARED_MEMORY_SIZE=256 \
// RUN: %libomptarget-run-generic | %fcheck-generic
// RUN: %libomptarget-compileopt-generic
// RUN: env LIBOMPTARGET_SHARED_MEMORY_SIZE=256 \
// RUN: %libomptarget-run-generic | %fcheck-generic
// REQUIRES: gpu
// XFAIL: intelgpu
#include <omp.h>
#include <stdio.h>
// Test entry point: two device threads communicate through the dynamic shared
// memory buffer, and the host prints PASS when the value stored by thread 0 is
// the one thread 1 read back into x.
int main() {
int x;
#pragma omp target parallel map(from : x)
{
// The 252 offset pairs with the 256 passed through
// LIBOMPTARGET_SHARED_MEMORY_SIZE on the run lines above.
// NOTE(review): presumably a byte offset that selects the last 4-byte slot of
// the buffer -- confirm against the declared return type of the allocator.
int *buf = llvm_omp_target_dynamic_shared_alloc() + 252;
#pragma omp barrier
// Only thread 0 stores into the buffer.
if (omp_get_thread_num() == 0)
*buf = 1;
// Barrier between thread 0's store and thread 1's load.
#pragma omp barrier
if (omp_get_thread_num() == 1)
x = *buf;
}
// CHECK: PASS
// Passing requires x == 1 and a NULL result from the allocator when it is
// called here, outside the target region.
if (x == 1 && llvm_omp_target_dynamic_shared_alloc() == NULL)
printf("PASS\n");
}

View File

@@ -1,26 +0,0 @@
// RUN: %libomptarget-compile-amdgcn-amd-amdhsa -O2 -mllvm \
// RUN: -openmp-opt-inline-device
// RUN: env LIBOMPTARGET_SHARED_MEMORY_SIZE=256 \
// RUN: %libomptarget-run-amdgcn-amd-amdhsa | %fcheck-amdgcn-amd-amdhsa
// REQUIRES: amdgcn-amd-amdhsa
#include <omp.h>
#include <stdio.h>
// Test entry point: two device threads communicate through the dynamic shared
// memory buffer, and the host prints PASS when the value stored by thread 0 is
// the one thread 1 read back into x.
int main() {
int x;
#pragma omp target parallel map(from : x)
{
// The 252 offset pairs with the 256 passed through
// LIBOMPTARGET_SHARED_MEMORY_SIZE on the run lines above.
// NOTE(review): presumably a byte offset that selects the last 4-byte slot of
// the buffer -- confirm against the declared return type of the allocator.
int *buf = llvm_omp_target_dynamic_shared_alloc() + 252;
#pragma omp barrier
// Only thread 0 stores into the buffer.
if (omp_get_thread_num() == 0)
*buf = 1;
// Barrier between thread 0's store and thread 1's load.
#pragma omp barrier
if (omp_get_thread_num() == 1)
x = *buf;
}
// CHECK: PASS
// Passing requires x == 1 and a NULL result from the allocator when it is
// called here, outside the target region.
if (x == 1 && llvm_omp_target_dynamic_shared_alloc() == NULL)
printf("PASS\n");
}

View File

@@ -733,7 +733,6 @@ variables is defined below.
* ``LIBOMPTARGET_INFO=<Num>``
* ``LIBOMPTARGET_HEAP_SIZE=<Num>``
* ``LIBOMPTARGET_STACK_SIZE=<Num>``
* ``LIBOMPTARGET_SHARED_MEMORY_SIZE=<Num>``
* ``LIBOMPTARGET_MAP_FORCE_ATOMIC=[TRUE/FALSE] (default TRUE)``
* ``LIBOMPTARGET_TREAT_ATTACH_AUTO_AS_ALWAYS=[TRUE/FALSE] (default FALSE)``
* ``LIBOMPTARGET_JIT_OPT_LEVEL={0,1,2,3} (default 3)``
@@ -1059,14 +1058,6 @@ allocated using ``malloc`` and ``free`` for the CUDA plugin. This is necessary
for some applications that allocate too much memory either through the user or
globalization.
LIBOMPTARGET_SHARED_MEMORY_SIZE
"""""""""""""""""""""""""""""""
This environment variable sets the amount of dynamic shared memory in bytes used
by the kernel once it is launched. A pointer to the dynamic memory buffer can be
accessed using the ``llvm_omp_target_dynamic_shared_alloc`` function. An example
is shown in :ref:`libomptarget_dynamic_shared`.
.. toctree::
:hidden:
:maxdepth: 1
@@ -1233,7 +1224,6 @@ Environment Variables
There are several environment variables to change the behavior of the plugins:
* ``LIBOMPTARGET_SHARED_MEMORY_SIZE``
* ``LIBOMPTARGET_STACK_SIZE``
* ``LIBOMPTARGET_HEAP_SIZE``
* ``LIBOMPTARGET_NUM_INITIAL_STREAMS``
@@ -1247,8 +1237,8 @@ There are several environment variables to change the behavior of the plugins:
* ``LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS``
* ``LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT``
The environment variables ``LIBOMPTARGET_SHARED_MEMORY_SIZE``,
``LIBOMPTARGET_STACK_SIZE`` and ``LIBOMPTARGET_HEAP_SIZE`` are described in
The environment variables ``LIBOMPTARGET_STACK_SIZE`` and
``LIBOMPTARGET_HEAP_SIZE`` are described in
:ref:`libopenmptarget_environment_vars`.
LIBOMPTARGET_NUM_INITIAL_STREAMS
@@ -1401,6 +1391,10 @@ LIBOMPTARGET_RPC_LATENCY
""""""""""""""""""""""""
This is the maximum amount of time the client will wait for a response from the server.
.. warning::
The ``LIBOMPTARGET_SHARED_MEMORY_SIZE`` environment variable is not
supported anymore. Please use the ``dyn_groupprivate`` clause instead, as
shown in :ref:`libomptarget_dynamic_shared`.
.. _libomptarget_libc:
@@ -1463,35 +1457,21 @@ IR during compilation.
Dynamic Shared Memory
^^^^^^^^^^^^^^^^^^^^^
The target device runtime contains a pointer to the dynamic shared memory
buffer. This pointer can be obtained using the
The OpenMP implementation provides access to dynamic shared memory in ``target``
regions through the ``dyn_groupprivate`` clause, introduced in OpenMP 6.1. This
is the preferred method to obtain dynamic shared memory. Please refer to
the OpenMP standard documentation for more information.
As an alternative, the target device runtime contains a pointer to the native
dynamic shared memory buffer. This pointer can be obtained using the
``llvm_omp_target_dynamic_shared_alloc`` extension. If this function is called
from the host it will simply return a null pointer. In order to use this buffer
the kernel must be launched with an adequate amount of dynamic shared memory
allocated. This can be done using the ``LIBOMPTARGET_SHARED_MEMORY_SIZE``
environment variable or the ``ompx_dyn_cgroup_mem(<N>)`` target directive
clause. Examples for both are given below.
allocated. This can be done using the ``ompx_dyn_cgroup_mem(<N>)`` target
directive clause. An example is given below.
.. code-block:: c++
// Example: thread 0 writes to the dynamic shared memory buffer and thread 1
// reads the value back into x, which is mapped from the device to the host.
void foo() {
int x;
#pragma omp target parallel map(from : x)
{
int *buf = llvm_omp_target_dynamic_shared_alloc();
if (omp_get_thread_num() == 0)
*buf = 1;
// Barrier between thread 0's store and thread 1's load.
#pragma omp barrier
if (omp_get_thread_num() == 1)
x = *buf;
}
assert(x == 1);
}
.. code-block:: console
$ clang++ -fopenmp --offload-arch=sm_80 -O3 shared.c
$ env LIBOMPTARGET_SHARED_MEMORY_SIZE=256 ./shared
Please note that the ``LIBOMPTARGET_SHARED_MEMORY_SIZE`` environment variable
is no longer supported.
.. code-block:: c++
@@ -1509,11 +1489,6 @@ clause. Examples for both are given below.
assert(x == 1);
}
.. code-block:: console
$ clang++ -fopenmp --offload-arch=gfx90a -O3 shared.c
$ env ./shared
.. _libomptarget_device_allocator:
Device Allocation