[offload] Remove LIBOMPTARGET_SHARED_MEMORY_SIZE envar (#186231)
This commit removes the `LIBOMPTARGET_SHARED_MEMORY_SIZE` envar and outputs a runtime warning if it is defined. Access to dynamic shared memory should be obtained through the `dyn_groupprivate` clause (OpenMP 6.1) or the launch arguments in liboffload kernel launch.
This commit is contained in:
committed by
GitHub
parent
e51e9afe68
commit
ac71b185c2
@@ -26,11 +26,22 @@ static uint32_t RefCount = 0;
|
|||||||
std::atomic<bool> RTLAlive{false};
|
std::atomic<bool> RTLAlive{false};
|
||||||
std::atomic<int> RTLOngoingSyncs{0};
|
std::atomic<int> RTLOngoingSyncs{0};
|
||||||
|
|
||||||
|
/// Check deleted and deprecated features, such as environment variables.
|
||||||
|
static void checkRuntimeEnvironment() {
|
||||||
|
const char *ShmemEnvarName = "LIBOMPTARGET_SHARED_MEMORY_SIZE";
|
||||||
|
if (std::getenv(ShmemEnvarName))
|
||||||
|
MESSAGE("Warning: %s is no longer valid. Please use OpenMP clause "
|
||||||
|
"'dyn_groupprivate' instead.\n",
|
||||||
|
ShmemEnvarName);
|
||||||
|
}
|
||||||
|
|
||||||
void initRuntime() {
|
void initRuntime() {
|
||||||
std::scoped_lock<decltype(PluginMtx)> Lock(PluginMtx);
|
std::scoped_lock<decltype(PluginMtx)> Lock(PluginMtx);
|
||||||
Profiler::get();
|
Profiler::get();
|
||||||
TIMESCOPE();
|
TIMESCOPE();
|
||||||
|
|
||||||
|
checkRuntimeEnvironment();
|
||||||
|
|
||||||
if (PM == nullptr)
|
if (PM == nullptr)
|
||||||
PM = new PluginManager();
|
PM = new PluginManager();
|
||||||
|
|
||||||
|
|||||||
@@ -211,7 +211,7 @@ DeviceTy::loadBinary(__tgt_device_image *Img) {
|
|||||||
DeviceEnvironment.NumDevices = RTL->getNumDevices();
|
DeviceEnvironment.NumDevices = RTL->getNumDevices();
|
||||||
// TODO: The device ID used here is not the real device ID used by OpenMP.
|
// TODO: The device ID used here is not the real device ID used by OpenMP.
|
||||||
DeviceEnvironment.DeviceNum = RTLDeviceID;
|
DeviceEnvironment.DeviceNum = RTLDeviceID;
|
||||||
DeviceEnvironment.DynamicMemSize = GenericDevice.getDynamicMemorySize();
|
DeviceEnvironment.DynamicMemSize = 0;
|
||||||
DeviceEnvironment.ClockFrequency = GenericDevice.getClockFrequency();
|
DeviceEnvironment.ClockFrequency = GenericDevice.getClockFrequency();
|
||||||
DeviceEnvironment.IndirectCallTable =
|
DeviceEnvironment.IndirectCallTable =
|
||||||
reinterpret_cast<uintptr_t>(CallTablePairOrErr->first);
|
reinterpret_cast<uintptr_t>(CallTablePairOrErr->first);
|
||||||
|
|||||||
@@ -3822,10 +3822,6 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
|
|||||||
KernelArgs.DynCGroupMem);
|
KernelArgs.DynCGroupMem);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Increase to the requested dynamic memory size for the device if needed.
|
|
||||||
DynBlockMemSize =
|
|
||||||
std::max(DynBlockMemSize, GenericDevice.getDynamicMemorySize());
|
|
||||||
|
|
||||||
// HSA requires the group segment size to include both static and dynamic.
|
// HSA requires the group segment size to include both static and dynamic.
|
||||||
uint32_t TotalBlockMemSize = getStaticBlockMemSize() + DynBlockMemSize;
|
uint32_t TotalBlockMemSize = getStaticBlockMemSize() + DynBlockMemSize;
|
||||||
|
|
||||||
|
|||||||
@@ -1023,7 +1023,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
|
|||||||
return GridValues.GV_Default_Num_Teams;
|
return GridValues.GV_Default_Num_Teams;
|
||||||
}
|
}
|
||||||
uint32_t getDebugKind() const { return OMPX_DebugKind; }
|
uint32_t getDebugKind() const { return OMPX_DebugKind; }
|
||||||
uint32_t getDynamicMemorySize() const { return OMPX_SharedMemorySize; }
|
|
||||||
virtual uint64_t getClockFrequency() const { return CLOCKS_PER_SEC; }
|
virtual uint64_t getClockFrequency() const { return CLOCKS_PER_SEC; }
|
||||||
|
|
||||||
/// Get target compute unit kind (e.g., sm_80, or gfx908).
|
/// Get target compute unit kind (e.g., sm_80, or gfx908).
|
||||||
@@ -1196,7 +1195,6 @@ private:
|
|||||||
|
|
||||||
/// Environment variables defined by the LLVM OpenMP implementation.
|
/// Environment variables defined by the LLVM OpenMP implementation.
|
||||||
Int32Envar OMPX_DebugKind;
|
Int32Envar OMPX_DebugKind;
|
||||||
UInt32Envar OMPX_SharedMemorySize;
|
|
||||||
UInt64Envar OMPX_TargetStackSize;
|
UInt64Envar OMPX_TargetStackSize;
|
||||||
UInt64Envar OMPX_TargetHeapSize;
|
UInt64Envar OMPX_TargetHeapSize;
|
||||||
|
|
||||||
|
|||||||
@@ -776,7 +776,6 @@ GenericDeviceTy::GenericDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId,
|
|||||||
OMP_NumTeams("OMP_NUM_TEAMS"),
|
OMP_NumTeams("OMP_NUM_TEAMS"),
|
||||||
OMP_TeamsThreadLimit("OMP_TEAMS_THREAD_LIMIT"),
|
OMP_TeamsThreadLimit("OMP_TEAMS_THREAD_LIMIT"),
|
||||||
OMPX_DebugKind("LIBOMPTARGET_DEVICE_RTL_DEBUG"),
|
OMPX_DebugKind("LIBOMPTARGET_DEVICE_RTL_DEBUG"),
|
||||||
OMPX_SharedMemorySize("LIBOMPTARGET_SHARED_MEMORY_SIZE"),
|
|
||||||
// Do not initialize the following two envars since they depend on the
|
// Do not initialize the following two envars since they depend on the
|
||||||
// device initialization. These cannot be consulted until the device is
|
// device initialization. These cannot be consulted until the device is
|
||||||
// initialized correctly. We initialize them in GenericDeviceTy::init().
|
// initialized correctly. We initialize them in GenericDeviceTy::init().
|
||||||
|
|||||||
@@ -1491,10 +1491,6 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
|
|||||||
if (GenericDevice.getRPCServer())
|
if (GenericDevice.getRPCServer())
|
||||||
GenericDevice.Plugin.getRPCServer().Thread->notify();
|
GenericDevice.Plugin.getRPCServer().Thread->notify();
|
||||||
|
|
||||||
// Increase to the requested dynamic memory size for the device if needed.
|
|
||||||
DynBlockMemSize =
|
|
||||||
std::max(DynBlockMemSize, GenericDevice.getDynamicMemorySize());
|
|
||||||
|
|
||||||
// In case we require more memory than the current limit.
|
// In case we require more memory than the current limit.
|
||||||
if (DynBlockMemSize >= MaxDynBlockMemSize) {
|
if (DynBlockMemSize >= MaxDynBlockMemSize) {
|
||||||
CUresult AttrResult = cuFuncSetAttribute(
|
CUresult AttrResult = cuFuncSetAttribute(
|
||||||
|
|||||||
@@ -1,31 +0,0 @@
|
|||||||
// RUN: %libomptarget-compile-generic
|
|
||||||
// RUN: env LIBOMPTARGET_SHARED_MEMORY_SIZE=256 \
|
|
||||||
// RUN: %libomptarget-run-generic | %fcheck-generic
|
|
||||||
|
|
||||||
// RUN: %libomptarget-compileopt-generic
|
|
||||||
// RUN: env LIBOMPTARGET_SHARED_MEMORY_SIZE=256 \
|
|
||||||
// RUN: %libomptarget-run-generic | %fcheck-generic
|
|
||||||
|
|
||||||
// REQUIRES: gpu
|
|
||||||
// XFAIL: intelgpu
|
|
||||||
|
|
||||||
#include <omp.h>
|
|
||||||
#include <stdio.h>
|
|
||||||
|
|
||||||
int main() {
|
|
||||||
int x;
|
|
||||||
#pragma omp target parallel map(from : x)
|
|
||||||
{
|
|
||||||
int *buf = llvm_omp_target_dynamic_shared_alloc() + 252;
|
|
||||||
#pragma omp barrier
|
|
||||||
if (omp_get_thread_num() == 0)
|
|
||||||
*buf = 1;
|
|
||||||
#pragma omp barrier
|
|
||||||
if (omp_get_thread_num() == 1)
|
|
||||||
x = *buf;
|
|
||||||
}
|
|
||||||
|
|
||||||
// CHECK: PASS
|
|
||||||
if (x == 1 && llvm_omp_target_dynamic_shared_alloc() == NULL)
|
|
||||||
printf("PASS\n");
|
|
||||||
}
|
|
||||||
@@ -1,26 +0,0 @@
|
|||||||
// RUN: %libomptarget-compile-amdgcn-amd-amdhsa -O2 -mllvm \
|
|
||||||
// RUN: -openmp-opt-inline-device
|
|
||||||
// RUN: env LIBOMPTARGET_SHARED_MEMORY_SIZE=256 \
|
|
||||||
// RUN: %libomptarget-run-amdgcn-amd-amdhsa | %fcheck-amdgcn-amd-amdhsa
|
|
||||||
// REQUIRES: amdgcn-amd-amdhsa
|
|
||||||
|
|
||||||
#include <omp.h>
|
|
||||||
#include <stdio.h>
|
|
||||||
|
|
||||||
int main() {
|
|
||||||
int x;
|
|
||||||
#pragma omp target parallel map(from : x)
|
|
||||||
{
|
|
||||||
int *buf = llvm_omp_target_dynamic_shared_alloc() + 252;
|
|
||||||
#pragma omp barrier
|
|
||||||
if (omp_get_thread_num() == 0)
|
|
||||||
*buf = 1;
|
|
||||||
#pragma omp barrier
|
|
||||||
if (omp_get_thread_num() == 1)
|
|
||||||
x = *buf;
|
|
||||||
}
|
|
||||||
|
|
||||||
// CHECK: PASS
|
|
||||||
if (x == 1 && llvm_omp_target_dynamic_shared_alloc() == NULL)
|
|
||||||
printf("PASS\n");
|
|
||||||
}
|
|
||||||
@@ -733,7 +733,6 @@ variables is defined below.
|
|||||||
* ``LIBOMPTARGET_INFO=<Num>``
|
* ``LIBOMPTARGET_INFO=<Num>``
|
||||||
* ``LIBOMPTARGET_HEAP_SIZE=<Num>``
|
* ``LIBOMPTARGET_HEAP_SIZE=<Num>``
|
||||||
* ``LIBOMPTARGET_STACK_SIZE=<Num>``
|
* ``LIBOMPTARGET_STACK_SIZE=<Num>``
|
||||||
* ``LIBOMPTARGET_SHARED_MEMORY_SIZE=<Num>``
|
|
||||||
* ``LIBOMPTARGET_MAP_FORCE_ATOMIC=[TRUE/FALSE] (default TRUE)``
|
* ``LIBOMPTARGET_MAP_FORCE_ATOMIC=[TRUE/FALSE] (default TRUE)``
|
||||||
* ``LIBOMPTARGET_TREAT_ATTACH_AUTO_AS_ALWAYS=[TRUE/FALSE] (default FALSE)``
|
* ``LIBOMPTARGET_TREAT_ATTACH_AUTO_AS_ALWAYS=[TRUE/FALSE] (default FALSE)``
|
||||||
* ``LIBOMPTARGET_JIT_OPT_LEVEL={0,1,2,3} (default 3)``
|
* ``LIBOMPTARGET_JIT_OPT_LEVEL={0,1,2,3} (default 3)``
|
||||||
@@ -1059,14 +1058,6 @@ allocated using ``malloc`` and ``free`` for the CUDA plugin. This is necessary
|
|||||||
for some applications that allocate too much memory either through the user or
|
for some applications that allocate too much memory either through the user or
|
||||||
globalization.
|
globalization.
|
||||||
|
|
||||||
LIBOMPTARGET_SHARED_MEMORY_SIZE
|
|
||||||
"""""""""""""""""""""""""""""""
|
|
||||||
|
|
||||||
This environment variable sets the amount of dynamic shared memory in bytes used
|
|
||||||
by the kernel once it is launched. A pointer to the dynamic memory buffer can be
|
|
||||||
accessed using the ``llvm_omp_target_dynamic_shared_alloc`` function. An example
|
|
||||||
is shown in :ref:`libomptarget_dynamic_shared`.
|
|
||||||
|
|
||||||
.. toctree::
|
.. toctree::
|
||||||
:hidden:
|
:hidden:
|
||||||
:maxdepth: 1
|
:maxdepth: 1
|
||||||
@@ -1233,7 +1224,6 @@ Environment Variables
|
|||||||
|
|
||||||
There are several environment variables to change the behavior of the plugins:
|
There are several environment variables to change the behavior of the plugins:
|
||||||
|
|
||||||
* ``LIBOMPTARGET_SHARED_MEMORY_SIZE``
|
|
||||||
* ``LIBOMPTARGET_STACK_SIZE``
|
* ``LIBOMPTARGET_STACK_SIZE``
|
||||||
* ``LIBOMPTARGET_HEAP_SIZE``
|
* ``LIBOMPTARGET_HEAP_SIZE``
|
||||||
* ``LIBOMPTARGET_NUM_INITIAL_STREAMS``
|
* ``LIBOMPTARGET_NUM_INITIAL_STREAMS``
|
||||||
@@ -1247,8 +1237,8 @@ There are several environment variables to change the behavior of the plugins:
|
|||||||
* ``LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS``
|
* ``LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS``
|
||||||
* ``LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT``
|
* ``LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT``
|
||||||
|
|
||||||
The environment variables ``LIBOMPTARGET_SHARED_MEMORY_SIZE``,
|
The environment variables ``LIBOMPTARGET_STACK_SIZE`` and
|
||||||
``LIBOMPTARGET_STACK_SIZE`` and ``LIBOMPTARGET_HEAP_SIZE`` are described in
|
``LIBOMPTARGET_HEAP_SIZE`` are described in
|
||||||
:ref:`libopenmptarget_environment_vars`.
|
:ref:`libopenmptarget_environment_vars`.
|
||||||
|
|
||||||
LIBOMPTARGET_NUM_INITIAL_STREAMS
|
LIBOMPTARGET_NUM_INITIAL_STREAMS
|
||||||
@@ -1401,6 +1391,10 @@ LIBOMPTARGET_RPC_LATENCY
|
|||||||
""""""""""""""""""""""""
|
""""""""""""""""""""""""
|
||||||
This is the maximum amount of time the client will wait for a response from the server.
|
This is the maximum amount of time the client will wait for a response from the server.
|
||||||
|
|
||||||
|
.. warning::
|
||||||
|
The ``LIBOMPTARGET_SHARED_MEMORY_SIZE`` environment variable is not
|
||||||
|
supported anymore. Please use the ``dyn_groupprivate`` clause instead, as
|
||||||
|
shown in :ref:`libomptarget_dynamic_shared`.
|
||||||
|
|
||||||
.. _libomptarget_libc:
|
.. _libomptarget_libc:
|
||||||
|
|
||||||
@@ -1463,35 +1457,21 @@ IR during compilation.
|
|||||||
Dynamic Shared Memory
|
Dynamic Shared Memory
|
||||||
^^^^^^^^^^^^^^^^^^^^^
|
^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
The target device runtime contains a pointer to the dynamic shared memory
|
The OpenMP implementation provides access to dynamic shared memory in ``target``
|
||||||
buffer. This pointer can be obtained using the
|
regions through the ``dyn_groupprivate`` clause, introduced in OpenMP 6.1. This
|
||||||
|
is the preferred method to obtain dynamic shared memory. Please refer to
|
||||||
|
the OpenMP standard documentation for more information.
|
||||||
|
|
||||||
|
As an alternative, the target device runtime contains a pointer to the native
|
||||||
|
dynamic shared memory buffer. This pointer can be obtained using the
|
||||||
``llvm_omp_target_dynamic_shared_alloc`` extension. If this function is called
|
``llvm_omp_target_dynamic_shared_alloc`` extension. If this function is called
|
||||||
from the host it will simply return a null pointer. In order to use this buffer
|
from the host it will simply return a null pointer. In order to use this buffer
|
||||||
the kernel must be launched with an adequate amount of dynamic shared memory
|
the kernel must be launched with an adequate amount of dynamic shared memory
|
||||||
allocated. This can be done using the ``LIBOMPTARGET_SHARED_MEMORY_SIZE``
|
allocated. This can be done using the ``ompx_dyn_cgroup_mem(<N>)`` target
|
||||||
environment variable or the ``ompx_dyn_cgroup_mem(<N>)`` target directive
|
directive clause. An example is given below.
|
||||||
clause. Examples for both are given below.
|
|
||||||
|
|
||||||
.. code-block:: c++
|
Please notice that the ``LIBOMPTARGET_SHARED_MEMORY_SIZE`` environment variable
|
||||||
|
is not supported anymore.
|
||||||
void foo() {
|
|
||||||
int x;
|
|
||||||
#pragma omp target parallel map(from : x)
|
|
||||||
{
|
|
||||||
int *buf = llvm_omp_target_dynamic_shared_alloc();
|
|
||||||
if (omp_get_thread_num() == 0)
|
|
||||||
*buf = 1;
|
|
||||||
#pragma omp barrier
|
|
||||||
if (omp_get_thread_num() == 1)
|
|
||||||
x = *buf;
|
|
||||||
}
|
|
||||||
assert(x == 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
.. code-block:: console
|
|
||||||
|
|
||||||
$ clang++ -fopenmp --offload-arch=sm_80 -O3 shared.c
|
|
||||||
$ env LIBOMPTARGET_SHARED_MEMORY_SIZE=256 ./shared
|
|
||||||
|
|
||||||
.. code-block:: c++
|
.. code-block:: c++
|
||||||
|
|
||||||
@@ -1509,11 +1489,6 @@ clause. Examples for both are given below.
|
|||||||
assert(x == 1);
|
assert(x == 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
.. code-block:: console
|
|
||||||
|
|
||||||
$ clang++ -fopenmp --offload-arch=gfx90a -O3 shared.c
|
|
||||||
$ env ./shared
|
|
||||||
|
|
||||||
.. _libomptarget_device_allocator:
|
.. _libomptarget_device_allocator:
|
||||||
|
|
||||||
Device Allocation
|
Device Allocation
|
||||||
|
|||||||
Reference in New Issue
Block a user