[flang-rt] Remove experimental OpenMP offloading support (#183653)
Summary: This, as far as I am aware, has mostly been superseded by the runtimes build that's built on top of libc. This build links 30% faster, supports more functionality, and uses 95% less disk space, so it seems to be the direction we want to go. CUDA support remains; this is not needed urgently.
This commit is contained in:
@@ -179,11 +179,10 @@ if (NOT FLANG_RT_ENABLE_STATIC AND NOT FLANG_RT_ENABLE_SHARED)
|
||||
endif ()
|
||||
|
||||
|
||||
set(FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT "" CACHE STRING "Compile Flang-RT with GPU support (CUDA or OpenMP)")
|
||||
set(FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT "" CACHE STRING "Compile Flang-RT with GPU support (CUDA)")
|
||||
set_property(CACHE FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT PROPERTY STRINGS
|
||||
""
|
||||
CUDA
|
||||
OpenMP
|
||||
)
|
||||
if (NOT FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT)
|
||||
# Support for GPUs disabled
|
||||
@@ -191,30 +190,8 @@ elseif (FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT STREQUAL "CUDA")
|
||||
# Support for CUDA
|
||||
set(FLANG_RT_LIBCUDACXX_PATH "" CACHE PATH "Path to libcu++ package installation")
|
||||
option(FLANG_RT_CUDA_RUNTIME_PTX_WITHOUT_GLOBAL_VARS "Do not compile global variables' definitions when producing PTX library" OFF)
|
||||
elseif (FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT STREQUAL "OpenMP")
|
||||
# Support for OpenMP offloading
|
||||
set(FLANG_RT_DEVICE_ARCHITECTURES "all" CACHE STRING
|
||||
"List of OpenMP device architectures to be used to compile the Fortran runtime (e.g. 'gfx1103;sm_90')"
|
||||
)
|
||||
|
||||
if (FLANG_RT_DEVICE_ARCHITECTURES STREQUAL "all")
|
||||
# TODO: support auto detection on the build system.
|
||||
set(all_amdgpu_architectures
|
||||
"gfx700;gfx701;gfx801;gfx803;gfx900;gfx902;gfx906"
|
||||
"gfx908;gfx90a;gfx90c;gfx940;gfx1010;gfx1030"
|
||||
"gfx1031;gfx1032;gfx1033;gfx1034;gfx1035;gfx1036"
|
||||
"gfx1100;gfx1101;gfx1102;gfx1103;gfx1150;gfx1151"
|
||||
"gfx1152;gfx1153;gfx1170")
|
||||
set(all_nvptx_architectures
|
||||
"sm_35;sm_37;sm_50;sm_52;sm_53;sm_60;sm_61;sm_62"
|
||||
"sm_70;sm_72;sm_75;sm_80;sm_86;sm_89;sm_90")
|
||||
set(all_gpu_architectures
|
||||
"${all_amdgpu_architectures};${all_nvptx_architectures}")
|
||||
set(FLANG_RT_DEVICE_ARCHITECTURES ${all_gpu_architectures})
|
||||
endif()
|
||||
list(REMOVE_DUPLICATES FLANG_RT_DEVICE_ARCHITECTURES)
|
||||
else ()
|
||||
message(FATAL_ERROR "Invalid value '${FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT}' for FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT; must be empty, 'CUDA', or 'OpenMP'")
|
||||
message(FATAL_ERROR "Invalid value '${FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT}' for FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT; must be empty or 'CUDA'")
|
||||
endif ()
|
||||
|
||||
|
||||
|
||||
@@ -146,16 +146,12 @@ CMake itself provide.
|
||||
the compiler for `__float128` or 128-bit `long double` support.
|
||||
[More details](docs/Real16MathSupport.md).
|
||||
|
||||
* `FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT` (values: `"CUDA"`,`"OpenMP"`, `""` default: `""`)
|
||||
* `FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT` (values: `"CUDA"`, `""`; default: `""`)
|
||||
|
||||
When set to `CUDA`, builds Flang-RT with experimental support for GPU
|
||||
accelerators using CUDA. `CMAKE_CUDA_COMPILER` must be set if not
|
||||
automatically detected by CMake. `nvcc` as well as `clang` are supported.
|
||||
|
||||
When set to `OpenMP`, builds Flang-RT with experimental support for
|
||||
GPU accelerators using OpenMP offloading. Only Clang is supported for
|
||||
`CMAKE_C_COMPILER` and `CMAKE_CXX_COMPILER`.
|
||||
|
||||
* `FLANG_RT_INCLUDE_CUF` (bool, default: `OFF`)
|
||||
|
||||
Compiles the `libflang_rt.cuda_<CUDA-version>.a/.so` library. This is
|
||||
@@ -181,13 +177,10 @@ additional configuration options become available.
|
||||
default.
|
||||
|
||||
|
||||
### Experimental OpenMP Offload Support
|
||||
|
||||
With `-DFLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT=OpenMP`, the following
|
||||
additional configuration options become available.
|
||||
### GPU Offloading Support
|
||||
|
||||
* `FLANG_RT_DEVICE_ARCHITECTURES` (default: `"all"`)
|
||||
|
||||
A list of device architectures that Flang-RT is going to support.
|
||||
If `"all"`, a pre-defined list of architectures is used. Same purpose as
|
||||
`LIBOMPTARGET_DEVICE_ARCHITECTURES` from liboffload.
|
||||
Flang-RT can be built for GPU targets (AMDGPU, NVPTX) using the LLVM
|
||||
runtimes build infrastructure. The easiest way to configure a build for
|
||||
GPU offloading is via the CMake cache file at
|
||||
`offload/cmake/caches/FlangOffload.cmake`.
|
||||
|
||||
@@ -71,45 +71,3 @@ macro(enable_cuda_compilation name files)
|
||||
endif()
|
||||
endmacro()
|
||||
|
||||
macro(enable_omp_offload_compilation name files)
|
||||
if (FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT STREQUAL "OpenMP")
|
||||
# OpenMP offload build only works with Clang compiler currently.
|
||||
|
||||
if (FLANG_RT_ENABLE_SHARED)
|
||||
message(FATAL_ERROR
|
||||
"FLANG_RT_ENABLE_SHARED is not supported for OpenMP offload build of Flang-RT"
|
||||
)
|
||||
endif()
|
||||
|
||||
if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang" AND
|
||||
"${CMAKE_C_COMPILER_ID}" MATCHES "Clang")
|
||||
|
||||
string(REPLACE ";" "," compile_for_architectures
|
||||
"${FLANG_RT_DEVICE_ARCHITECTURES}"
|
||||
)
|
||||
|
||||
set(OMP_COMPILE_OPTIONS
|
||||
-fopenmp
|
||||
-fvisibility=hidden
|
||||
-fopenmp-cuda-mode
|
||||
--offload-arch=${compile_for_architectures}
|
||||
# Force LTO for the device part.
|
||||
-foffload-lto
|
||||
)
|
||||
set_source_files_properties(${files} PROPERTIES COMPILE_OPTIONS
|
||||
"${OMP_COMPILE_OPTIONS}"
|
||||
)
|
||||
target_link_options(${name}.static PUBLIC ${OMP_COMPILE_OPTIONS})
|
||||
|
||||
# Enable "declare target" in the source code.
|
||||
set_source_files_properties(${files}
|
||||
PROPERTIES COMPILE_DEFINITIONS OMP_OFFLOAD_BUILD
|
||||
)
|
||||
else()
|
||||
message(FATAL_ERROR
|
||||
"Flang-rt build with OpenMP offload is not supported for these compilers:\n"
|
||||
"CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}\n"
|
||||
"CMAKE_C_COMPILER_ID: ${CMAKE_C_COMPILER_ID}")
|
||||
endif()
|
||||
endif()
|
||||
endmacro()
|
||||
|
||||
@@ -40,11 +40,7 @@ struct ExecutionEnvironment {
|
||||
typedef void (*ConfigEnvCallbackPtr)(
|
||||
int, const char *[], const char *[], const EnvironmentDefaultList *);
|
||||
|
||||
#if !defined(_OPENMP)
|
||||
// FIXME: https://github.com/llvm/llvm-project/issues/84942
|
||||
constexpr
|
||||
#endif
|
||||
ExecutionEnvironment(){};
|
||||
constexpr ExecutionEnvironment() {};
|
||||
void Configure(int argc, const char *argv[], const char *envp[],
|
||||
const EnvironmentDefaultList *envDefaults);
|
||||
|
||||
|
||||
@@ -200,7 +200,6 @@ if (NOT WIN32)
|
||||
)
|
||||
|
||||
enable_cuda_compilation(flang_rt.runtime "${supported_sources}")
|
||||
enable_omp_offload_compilation(flang_rt.runtime "${supported_sources}")
|
||||
|
||||
# Select a default runtime, which is used for unit and regression tests.
|
||||
get_target_property(default_target flang_rt.runtime.default ALIASED_TARGET)
|
||||
@@ -234,7 +233,6 @@ else()
|
||||
)
|
||||
|
||||
enable_cuda_compilation(${name} "${supported_sources}")
|
||||
enable_omp_offload_compilation(${name} "${supported_sources}")
|
||||
add_dependencies(flang_rt.runtime ${name})
|
||||
endfunction ()
|
||||
|
||||
|
||||
@@ -16,9 +16,6 @@
|
||||
#include "flang-rt/runtime/lock.h"
|
||||
#include "flang-rt/runtime/tools.h"
|
||||
|
||||
// NOTE: the header files above may define OpenMP declare target
|
||||
// variables, so they have to be included unconditionally
|
||||
// so that the offload entries are consistent between host and device.
|
||||
#if !defined(RT_USE_PSEUDO_FILE_UNIT)
|
||||
|
||||
#include <cstdio>
|
||||
|
||||
@@ -15,9 +15,6 @@
|
||||
#include "flang-rt/runtime/io-error.h"
|
||||
#include "flang-rt/runtime/tools.h"
|
||||
|
||||
// NOTE: the header files above may define OpenMP declare target
|
||||
// variables, so they have to be included unconditionally
|
||||
// so that the offload entries are consistent between host and device.
|
||||
#if defined(RT_USE_PSEUDO_FILE_UNIT)
|
||||
#include <cstdio>
|
||||
|
||||
|
||||
@@ -14,8 +14,7 @@
|
||||
|
||||
namespace Fortran::runtime {
|
||||
|
||||
#if !defined(RT_DEVICE_COMPILATION) && !defined(OMP_OFFLOAD_BUILD)
|
||||
// FLANG_RT_DEBUG code is disabled when false.
|
||||
#if !defined(RT_DEVICE_COMPILATION)
|
||||
static constexpr bool enableDebugOutput{false};
|
||||
#endif
|
||||
|
||||
@@ -79,7 +78,7 @@ RT_API_ATTRS Ticket &WorkQueue::StartTicket() {
|
||||
last_ = newTicket;
|
||||
}
|
||||
newTicket->ticket.begun = false;
|
||||
#if !defined(RT_DEVICE_COMPILATION) && !defined(OMP_OFFLOAD_BUILD)
|
||||
#if !defined(RT_DEVICE_COMPILATION)
|
||||
if (enableDebugOutput &&
|
||||
(executionEnvironment.internalDebugging &
|
||||
ExecutionEnvironment::WorkQueue)) {
|
||||
@@ -93,7 +92,7 @@ RT_API_ATTRS int WorkQueue::Run() {
|
||||
while (last_) {
|
||||
TicketList *at{last_};
|
||||
insertAfter_ = last_;
|
||||
#if !defined(RT_DEVICE_COMPILATION) && !defined(OMP_OFFLOAD_BUILD)
|
||||
#if !defined(RT_DEVICE_COMPILATION)
|
||||
if (enableDebugOutput &&
|
||||
(executionEnvironment.internalDebugging &
|
||||
ExecutionEnvironment::WorkQueue)) {
|
||||
@@ -102,7 +101,7 @@ RT_API_ATTRS int WorkQueue::Run() {
|
||||
}
|
||||
#endif
|
||||
int stat{at->ticket.Continue(*this)};
|
||||
#if !defined(RT_DEVICE_COMPILATION) && !defined(OMP_OFFLOAD_BUILD)
|
||||
#if !defined(RT_DEVICE_COMPILATION)
|
||||
if (enableDebugOutput &&
|
||||
(executionEnvironment.internalDebugging &
|
||||
ExecutionEnvironment::WorkQueue)) {
|
||||
|
||||
@@ -42,18 +42,6 @@ function(add_flangrt_unittest_offload_properties target)
|
||||
PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON
|
||||
)
|
||||
endif()
|
||||
# Enable OpenMP offload during linking. We may need to replace
|
||||
# LINK_OPTIONS with COMPILE_OPTIONS when there are OpenMP offload
|
||||
# unittests.
|
||||
#
|
||||
# FIXME: replace 'native' in --offload-arch option with the list
|
||||
# of targets that Fortran Runtime was built for.
|
||||
if (FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT STREQUAL "OpenMP")
|
||||
set_target_properties(${target}
|
||||
PROPERTIES LINK_OPTIONS
|
||||
"-fopenmp;--offload-arch=native"
|
||||
)
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
# flang-rt on Windows requires compiler-rt for some symbols. For binaries built
|
||||
|
||||
@@ -204,9 +204,18 @@ ninja install
|
||||
|
||||
|
||||
### Building Flang-RT for accelerators
|
||||
Flang runtime can be built for accelerators in experimental mode, i.e.
|
||||
complete enabling is WIP. CUDA and OpenMP target offload builds
|
||||
are currently supported.
|
||||
Flang runtime can be built for GPU targets (AMDGPU, NVPTX) using the LLVM
|
||||
runtimes build infrastructure. The recommended way to configure a build for GPU
|
||||
offloading is via the CMake cache file provided by `offload`.
|
||||
|
||||
```bash
|
||||
cmake ../llvm -G Ninja \
|
||||
-C ../offload/cmake/caches/FlangOffload.cmake \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DCMAKE_INSTALL_PREFIX=<PATH>
|
||||
```
|
||||
|
||||
An experimental CUDA build of the runtime is also available.
|
||||
|
||||
#### Building out-of-tree
|
||||
|
||||
@@ -299,33 +308,6 @@ number sufficiently low for all build jobs to fit into the available RAM. Using
|
||||
the number of hardware threads (`nprocs`) is likely too much for most
|
||||
commodity machines.
|
||||
|
||||
##### OpenMP target offload build
|
||||
Only the Clang compiler is currently supported.
|
||||
|
||||
```bash
|
||||
cd llvm-project
|
||||
rm -rf build_flang_runtime
|
||||
mkdir build_flang_runtime
|
||||
cd build_flang_runtime
|
||||
|
||||
cmake \
|
||||
-DLLVM_ENABLE_RUNTIMES=flang-rt \
|
||||
-DFLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT="OpenMP" \
|
||||
-DCMAKE_C_COMPILER=clang \
|
||||
-DCMAKE_CXX_COMPILER=clang++ \
|
||||
-DFLANG_RT_DEVICE_ARCHITECTURES=all \
|
||||
../runtimes/
|
||||
|
||||
make flang-rt
|
||||
```
|
||||
|
||||
The result of the build is a "device-only" library, i.e. the host
|
||||
part of the library is just a container for the device code.
|
||||
The resulting library may be linked to user programs using
|
||||
Clang-like device linking pipeline.
|
||||
|
||||
The same set of CMake variables works for Flang in-tree build.
|
||||
|
||||
### Build options
|
||||
|
||||
One may provide optional CMake variables to customize the build. Available options:
|
||||
|
||||
Reference in New Issue
Block a user