[OpenMP][Offload] Add offload runtime support for dyn_groupprivate clause (#152831)

Part 3: adds the offload runtime support. See
https://github.com/llvm/llvm-project/pull/152651.

---------

Co-authored-by: Krzysztof Parzyszek <Krzysztof.Parzyszek@amd.com>
This commit is contained in:
Kevin Sala Penades
2026-03-12 01:13:06 -07:00
committed by GitHub
parent 11e0d6ae4b
commit 1f583c6dee
27 changed files with 613 additions and 65 deletions

View File

@@ -101,7 +101,7 @@ __OMP_STRUCT_TYPE(DynamicEnvironment, DynamicEnvironmentTy, false, Int16)
__OMP_STRUCT_TYPE(KernelEnvironment, KernelEnvironmentTy, false, __OMP_STRUCT_TYPE(KernelEnvironment, KernelEnvironmentTy, false,
ConfigurationEnvironment, IdentPtr, DynamicEnvironmentPtr) ConfigurationEnvironment, IdentPtr, DynamicEnvironmentPtr)
__OMP_STRUCT_TYPE(KernelLaunchEnvironment, KernelLaunchEnvironmentTy, false, __OMP_STRUCT_TYPE(KernelLaunchEnvironment, KernelLaunchEnvironmentTy, false,
Int32, Int32) VoidPtr, VoidPtr, Int32, Int32, Int32, Int8)
#undef __OMP_STRUCT_TYPE #undef __OMP_STRUCT_TYPE
#undef OMP_STRUCT_TYPE #undef OMP_STRUCT_TYPE

View File

@@ -102,8 +102,9 @@ struct KernelArgsTy {
struct { struct {
uint64_t NoWait : 1; // Was this kernel spawned with a `nowait` clause. uint64_t NoWait : 1; // Was this kernel spawned with a `nowait` clause.
uint64_t IsCUDA : 1; // Was this kernel spawned via CUDA. uint64_t IsCUDA : 1; // Was this kernel spawned via CUDA.
uint64_t Unused : 62; uint64_t DynCGroupMemFallback : 2; // The fallback for dynamic cgroup mem.
} Flags = {0, 0, 0}; uint64_t Unused : 60;
} Flags = {0, 0, 0, 0};
// The number of teams (for x,y,z dimension). // The number of teams (for x,y,z dimension).
uint32_t NumTeams[3] = {0, 0, 0}; uint32_t NumTeams[3] = {0, 0, 0};
// The number of threads (for x,y,z dimension). // The number of threads (for x,y,z dimension).

View File

@@ -70,10 +70,25 @@ struct KernelEnvironmentTy {
DynamicEnvironmentTy *DynamicEnv = nullptr; DynamicEnvironmentTy *DynamicEnv = nullptr;
}; };
/// The fallback types for the dynamic cgroup (contention group) memory
/// requested for a kernel launch.
enum class DynCGroupMemFallbackType : uint8_t {
/// None. Indicates that no fallback was triggered for this launch.
None = 0,
/// Abort the execution when the requested memory cannot be provided.
/// Shares the value 0 with None: a launch that aborts never reports a
/// triggered fallback, so the two states never coexist.
Abort = None,
/// Return a null pointer instead of a dynamic memory buffer.
Null = 1,
/// Allocate from an implementation-defined memory space (e.g., global
/// device memory) when the native block memory is insufficient.
DefaultMem = 2
};
struct KernelLaunchEnvironmentTy { struct KernelLaunchEnvironmentTy {
void *ReductionBuffer = nullptr;
void *DynCGroupMemFbPtr = nullptr;
uint32_t ReductionCnt = 0; uint32_t ReductionCnt = 0;
uint32_t ReductionIterCnt = 0; uint32_t ReductionIterCnt = 0;
void *ReductionBuffer = nullptr; uint32_t DynCGroupMemSize = 0;
DynCGroupMemFallbackType DynCGroupMemFb = DynCGroupMemFallbackType::None;
}; };
#endif // OMPTARGET_SHARED_ENVIRONMENT_H #endif // OMPTARGET_SHARED_ENVIRONMENT_H

View File

@@ -37,6 +37,8 @@
#include "PluginInterface.h" #include "PluginInterface.h"
using GenericPluginTy = llvm::omp::target::plugin::GenericPluginTy; using GenericPluginTy = llvm::omp::target::plugin::GenericPluginTy;
using DeviceInfo = llvm::omp::target::plugin::DeviceInfo;
using InfoTreeNode = llvm::omp::target::plugin::InfoTreeNode;
// Forward declarations. // Forward declarations.
struct __tgt_bin_desc; struct __tgt_bin_desc;
@@ -167,6 +169,20 @@ struct DeviceTy {
/// Indicate that there are pending images for this device or not. /// Indicate that there are pending images for this device or not.
void setHasPendingImages(bool V) { HasPendingImages = V; } void setHasPendingImages(bool V) { HasPendingImages = V; }
/// Get a single piece of information from the device.
///
/// Queries the plugin for the device's info tree and extracts the entry
/// identified by \p Info as type \p T. Returns a value-initialized \c T{}
/// when the entry is missing or holds a different alternative type, so the
/// result is well-defined for any \p T (the previous `return 0` only worked
/// for numeric types).
template <typename T> T getInfo(DeviceInfo Info) const {
InfoTreeNode DevInfo = RTL->obtain_device_info(RTLDeviceID);
auto EntryOpt = DevInfo.get(Info);
if (!EntryOpt)
return T{};
auto Entry = *EntryOpt;
if (!std::holds_alternative<T>(Entry->Value))
return T{};
return std::get<T>(Entry->Value);
}
private: private:
/// Deinitialize the device (and plugin). /// Deinitialize the device (and plugin).
void deinit(); void deinit();

View File

@@ -274,12 +274,23 @@ struct __tgt_target_non_contig {
extern "C" { extern "C" {
#endif #endif
/// The OpenMP access group type. The criterion for grouping tasks using a
/// specific grouping property. Used by, e.g., omp_get_gprivate_limit() to
/// select the grouping scope to query.
enum omp_access_t {
/// Groups the tasks based on the contention group to which they belong.
omp_access_cgroup = 0,
/// Groups the tasks based on the parallel region to which they bind.
omp_access_pteam = 1,
};
void ompx_dump_mapping_tables(void); void ompx_dump_mapping_tables(void);
int omp_get_num_devices(void); int omp_get_num_devices(void);
int omp_get_device_num(void); int omp_get_device_num(void);
int omp_get_device_from_uid(const char *DeviceUid); int omp_get_device_from_uid(const char *DeviceUid);
const char *omp_get_uid_from_device(int DeviceNum); const char *omp_get_uid_from_device(int DeviceNum);
int omp_get_initial_device(void); int omp_get_initial_device(void);
size_t omp_get_gprivate_limit(int DeviceNum,
omp_access_t AccessGroup = omp_access_cgroup);
void *omp_target_alloc(size_t Size, int DeviceNum); void *omp_target_alloc(size_t Size, int DeviceNum);
void omp_target_free(void *DevicePtr, int DeviceNum); void omp_target_free(void *DevicePtr, int DeviceNum);
int omp_target_is_present(const void *Ptr, int DeviceNum); int omp_target_is_present(const void *Ptr, int DeviceNum);

View File

@@ -138,6 +138,22 @@ EXTERN int omp_get_initial_device(void) {
return HostDevice; return HostDevice;
} }
EXTERN size_t omp_get_gprivate_limit(int DeviceNum, omp_access_t AccessGroup) {
TIMESCOPE();
OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
// The host device exposes no groupprivate memory, and only the contention
// group scope is currently supported; report zero in both cases.
if (DeviceNum == omp_get_initial_device() || AccessGroup != omp_access_cgroup)
return 0;
auto Device = PM->getDevice(DeviceNum);
if (!Device)
FATAL_MESSAGE(DeviceNum, "%s", toString(Device.takeError()).c_str());
// Ask the device for its per-workgroup local memory capacity.
return Device->getInfo<uint64_t>(DeviceInfo::WORK_GROUP_LOCAL_MEM_SIZE);
}
EXTERN void *omp_target_alloc(size_t Size, int DeviceNum) { EXTERN void *omp_target_alloc(size_t Size, int DeviceNum) {
TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DeviceNum) + TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DeviceNum) +
";size=" + std::to_string(Size)); ";size=" + std::to_string(Size));

View File

@@ -43,6 +43,7 @@ VERS1.0 {
omp_get_device_from_uid; omp_get_device_from_uid;
omp_get_uid_from_device; omp_get_uid_from_device;
omp_get_initial_device; omp_get_initial_device;
omp_get_gprivate_limit;
omp_target_alloc; omp_target_alloc;
omp_target_free; omp_target_free;
omp_target_is_accessible; omp_target_is_accessible;

View File

@@ -559,6 +559,9 @@ struct AMDGPUKernelTy : public GenericKernelTy {
return Err; return Err;
} }
// Set the static block memory size required by the kernel.
StaticBlockMemSize = GroupSize;
// Make sure it is a kernel symbol. // Make sure it is a kernel symbol.
if (SymbolType != HSA_SYMBOL_KIND_KERNEL) if (SymbolType != HSA_SYMBOL_KIND_KERNEL)
return Plugin::error(ErrorCode::INVALID_BINARY, return Plugin::error(ErrorCode::INVALID_BINARY,
@@ -582,8 +585,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
/// Launch the AMDGPU kernel function. /// Launch the AMDGPU kernel function.
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3], Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
uint32_t NumBlocks[3], KernelArgsTy &KernelArgs, uint32_t NumBlocks[3], uint32_t DynBlockMemSize,
KernelLaunchParamsTy LaunchParams, KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override; AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
/// Return maximum block size for maximum occupancy /// Return maximum block size for maximum occupancy
@@ -3220,7 +3223,7 @@ private:
KernelArgsTy KernelArgs = {}; KernelArgsTy KernelArgs = {};
uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u}; uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u};
if (auto Err = AMDGPUKernel.launchImpl( if (auto Err = AMDGPUKernel.launchImpl(
*this, NumBlocksAndThreads, NumBlocksAndThreads, KernelArgs, *this, NumBlocksAndThreads, NumBlocksAndThreads, 0, KernelArgs,
KernelLaunchParamsTy{}, AsyncInfoWrapper)) KernelLaunchParamsTy{}, AsyncInfoWrapper))
return Err; return Err;
@@ -3755,6 +3758,7 @@ private:
Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice, Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
uint32_t NumThreads[3], uint32_t NumBlocks[3], uint32_t NumThreads[3], uint32_t NumBlocks[3],
uint32_t DynBlockMemSize,
KernelArgsTy &KernelArgs, KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams, KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const { AsyncInfoWrapperTy &AsyncInfoWrapper) const {
@@ -3767,13 +3771,6 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
if (auto Err = ArgsMemoryManager.allocate(ArgsSize, &AllArgs)) if (auto Err = ArgsMemoryManager.allocate(ArgsSize, &AllArgs))
return Err; return Err;
// Account for user requested dynamic shared memory.
uint32_t GroupSize = getGroupSize();
if (uint32_t MaxDynCGroupMem = std::max(
KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize())) {
GroupSize += MaxDynCGroupMem;
}
uint64_t StackSize; uint64_t StackSize;
if (auto Err = GenericDevice.getDeviceStackSize(StackSize)) if (auto Err = GenericDevice.getDeviceStackSize(StackSize))
return Err; return Err;
@@ -3825,9 +3822,17 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
KernelArgs.DynCGroupMem); KernelArgs.DynCGroupMem);
} }
// Increase to the requested dynamic memory size for the device if needed.
DynBlockMemSize =
std::max(DynBlockMemSize, GenericDevice.getDynamicMemorySize());
// HSA requires the group segment size to include both static and dynamic.
uint32_t TotalBlockMemSize = getStaticBlockMemSize() + DynBlockMemSize;
// Push the kernel launch into the stream. // Push the kernel launch into the stream.
return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks, return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
GroupSize, StackSize, ArgsMemoryManager); TotalBlockMemSize, StackSize,
ArgsMemoryManager);
} }
Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice, Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,

View File

@@ -301,6 +301,18 @@ private:
} }
}; };
/// Configuration of dynamic block memory needed for launching a kernel.
struct DynBlockMemConfTy {
/// The size of the dynamic block memory buffer visible to the kernel. May
/// differ from NativeSize when a fallback was triggered.
uint32_t Size = 0;
/// The size of dynamic shared memory natively provided by the device.
uint32_t NativeSize = 0;
/// The fallback that was triggered (if any).
DynCGroupMemFallbackType Fallback = DynCGroupMemFallbackType::None;
/// The fallback pointer if global memory was used as alternative.
void *FallbackPtr = nullptr;
};
/// Class wrapping a __tgt_device_image and its offload entry table on a /// Class wrapping a __tgt_device_image and its offload entry table on a
/// specific device. This class is responsible for storing and managing /// specific device. This class is responsible for storing and managing
/// the offload entries for an image on a device. /// the offload entries for an image on a device.
@@ -363,7 +375,7 @@ struct GenericKernelTy {
AsyncInfoWrapperTy &AsyncInfoWrapper) const; AsyncInfoWrapperTy &AsyncInfoWrapper) const;
virtual Error launchImpl(GenericDeviceTy &GenericDevice, virtual Error launchImpl(GenericDeviceTy &GenericDevice,
uint32_t NumThreads[3], uint32_t NumBlocks[3], uint32_t NumThreads[3], uint32_t NumBlocks[3],
KernelArgsTy &KernelArgs, uint32_t DynBlockMemSize, KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams, KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0; AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0;
@@ -373,6 +385,9 @@ struct GenericKernelTy {
/// Get the kernel name. /// Get the kernel name.
const char *getName() const { return Name.c_str(); } const char *getName() const { return Name.c_str(); }
/// Get the size of the static per-block memory consumed by the kernel.
uint32_t getStaticBlockMemSize() const { return StaticBlockMemSize; };
/// Get the kernel image. /// Get the kernel image.
DeviceImageTy &getImage() const { DeviceImageTy &getImage() const {
assert(ImagePtr && "Kernel is not initialized!"); assert(ImagePtr && "Kernel is not initialized!");
@@ -386,8 +401,10 @@ struct GenericKernelTy {
/// Return a device pointer to a new kernel launch environment. /// Return a device pointer to a new kernel launch environment.
Expected<KernelLaunchEnvironmentTy *> Expected<KernelLaunchEnvironmentTy *>
getKernelLaunchEnvironment(GenericDeviceTy &GenericDevice, uint32_t Version, getKernelLaunchEnvironment(GenericDeviceTy &GenericDevice,
AsyncInfoWrapperTy &AsyncInfo) const; const KernelArgsTy &KernelArgs,
const DynBlockMemConfTy &DynBlockMemConf,
AsyncInfoWrapperTy &AsyncInfoWrapper) const;
/// Indicate whether an execution mode is valid. /// Indicate whether an execution mode is valid.
static bool isValidExecutionMode(OMPTgtExecModeFlags ExecutionMode) { static bool isValidExecutionMode(OMPTgtExecModeFlags ExecutionMode) {
@@ -433,6 +450,12 @@ protected:
uint32_t NumBlocks[3]) const; uint32_t NumBlocks[3]) const;
private: private:
/// Prepare the block memory buffer requested for the kernel and execute the
/// specified fallback if necessary.
Expected<DynBlockMemConfTy> prepareBlockMemory(GenericDeviceTy &GenericDevice,
KernelArgsTy &KernelArgs,
uint32_t NumBlocks) const;
/// Prepare the arguments before launching the kernel. /// Prepare the arguments before launching the kernel.
KernelLaunchParamsTy KernelLaunchParamsTy
prepareArgs(GenericDeviceTy &GenericDevice, void **ArgPtrs, prepareArgs(GenericDeviceTy &GenericDevice, void **ArgPtrs,
@@ -487,6 +510,9 @@ protected:
/// The maximum number of threads which the kernel could leverage. /// The maximum number of threads which the kernel could leverage.
uint32_t MaxNumThreads; uint32_t MaxNumThreads;
/// The static memory sized per block.
uint32_t StaticBlockMemSize = 0;
/// The kernel environment, including execution flags. /// The kernel environment, including execution flags.
KernelEnvironmentTy KernelEnvironment; KernelEnvironmentTy KernelEnvironment;
@@ -1498,6 +1524,9 @@ public:
/// Query the current state of an asynchronous queue. /// Query the current state of an asynchronous queue.
int32_t query_async(int32_t DeviceId, __tgt_async_info *AsyncInfoPtr); int32_t query_async(int32_t DeviceId, __tgt_async_info *AsyncInfoPtr);
/// Obtain information about the given device.
InfoTreeNode obtain_device_info(int32_t DeviceId);
/// Prints information about the given devices supported by the plugin. /// Prints information about the given devices supported by the plugin.
void print_device_info(int32_t DeviceId); void print_device_info(int32_t DeviceId);

View File

@@ -435,20 +435,21 @@ Error GenericKernelTy::init(GenericDeviceTy &GenericDevice,
Expected<KernelLaunchEnvironmentTy *> Expected<KernelLaunchEnvironmentTy *>
GenericKernelTy::getKernelLaunchEnvironment( GenericKernelTy::getKernelLaunchEnvironment(
GenericDeviceTy &GenericDevice, uint32_t Version, GenericDeviceTy &GenericDevice, const KernelArgsTy &KernelArgs,
const DynBlockMemConfTy &DynBlockMemConf,
AsyncInfoWrapperTy &AsyncInfoWrapper) const { AsyncInfoWrapperTy &AsyncInfoWrapper) const {
// Ctor/Dtor have no arguments, replaying uses the original kernel launch // Ctor/Dtor have no arguments, replaying uses the original kernel launch
// environment. Older versions of the compiler do not generate a kernel // environment. Older versions of the compiler do not generate a kernel
// launch environment. // launch environment.
if (GenericDevice.Plugin.getRecordReplay().isReplaying() || if (GenericDevice.Plugin.getRecordReplay().isReplaying() ||
Version < OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR) KernelArgs.Version < OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR)
return nullptr; return nullptr;
if (!KernelEnvironment.Configuration.ReductionDataSize || if ((!KernelEnvironment.Configuration.ReductionDataSize ||
!KernelEnvironment.Configuration.ReductionBufferLength) !KernelEnvironment.Configuration.ReductionBufferLength) &&
KernelArgs.DynCGroupMem == 0)
return reinterpret_cast<KernelLaunchEnvironmentTy *>(~0); return reinterpret_cast<KernelLaunchEnvironmentTy *>(~0);
// TODO: Check if the kernel needs a launch environment.
auto AllocOrErr = GenericDevice.dataAlloc(sizeof(KernelLaunchEnvironmentTy), auto AllocOrErr = GenericDevice.dataAlloc(sizeof(KernelLaunchEnvironmentTy),
/*HostPtr=*/nullptr, /*HostPtr=*/nullptr,
TargetAllocTy::TARGET_ALLOC_DEVICE); TargetAllocTy::TARGET_ALLOC_DEVICE);
@@ -462,7 +463,14 @@ GenericKernelTy::getKernelLaunchEnvironment(
/// async data transfer. /// async data transfer.
auto &LocalKLE = (*AsyncInfoWrapper).KernelLaunchEnvironment; auto &LocalKLE = (*AsyncInfoWrapper).KernelLaunchEnvironment;
LocalKLE = KernelLaunchEnvironment; LocalKLE = KernelLaunchEnvironment;
{
LocalKLE.DynCGroupMemSize = DynBlockMemConf.Size;
LocalKLE.DynCGroupMemFbPtr = DynBlockMemConf.FallbackPtr;
LocalKLE.DynCGroupMemFb = DynBlockMemConf.Fallback;
LocalKLE.ReductionBuffer = nullptr;
if (KernelEnvironment.Configuration.ReductionDataSize &&
KernelEnvironment.Configuration.ReductionBufferLength) {
auto AllocOrErr = GenericDevice.dataAlloc( auto AllocOrErr = GenericDevice.dataAlloc(
KernelEnvironment.Configuration.ReductionDataSize * KernelEnvironment.Configuration.ReductionDataSize *
KernelEnvironment.Configuration.ReductionBufferLength, KernelEnvironment.Configuration.ReductionBufferLength,
@@ -508,14 +516,81 @@ Error GenericKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
return Plugin::success(); return Plugin::success();
} }
/// Compute the dynamic block memory configuration for a launch of this
/// kernel. When the device's block-shared memory cannot hold both the
/// kernel's static usage and the requested dynamic size, apply the
/// user-requested fallback policy: abort, provide no memory (null), or
/// allocate from global device memory instead.
Expected<DynBlockMemConfTy>
GenericKernelTy::prepareBlockMemory(GenericDeviceTy &GenericDevice,
KernelArgsTy &KernelArgs,
uint32_t NumBlocks) const {
uint32_t MaxBlockMemSize = GenericDevice.getMaxBlockSharedMemSize();
uint32_t DynBlockMemSize = KernelArgs.DynCGroupMem;
uint32_t TotalBlockMemSize = StaticBlockMemSize + DynBlockMemSize;
uint32_t DynNativeBlockMemSize = DynBlockMemSize;
void *DynFallbackPtr = nullptr;
// The fallback behavior requested for this launch (hoisted so the flag is
// decoded only once).
auto RequestedFallback = static_cast<DynCGroupMemFallbackType>(
KernelArgs.Flags.DynCGroupMemFallback);
// Not enough block memory to cover the static one. Cannot run the kernel.
if (StaticBlockMemSize > MaxBlockMemSize)
return Plugin::error(ErrorCode::INVALID_ARGUMENT,
"Static block memory size exceeds maximum");
// Not enough block memory to cover the dynamic one, and the fallback is
// aborting.
if (RequestedFallback == DynCGroupMemFallbackType::Abort &&
TotalBlockMemSize > MaxBlockMemSize)
return Plugin::error(
ErrorCode::INVALID_ARGUMENT,
"Requested block memory size (static + dynamic) exceeds maximum");
DynCGroupMemFallbackType DynFallback = DynCGroupMemFallbackType::None;
if (DynBlockMemSize && TotalBlockMemSize > MaxBlockMemSize) {
// Launch without native dynamic block memory.
DynNativeBlockMemSize = 0;
DynFallback = RequestedFallback;
if (DynFallback != DynCGroupMemFallbackType::DefaultMem) {
// Do not provide any memory as fallback.
DynBlockMemSize = 0;
} else {
// Get global memory as fallback: one chunk of DynBlockMemSize bytes
// per block.
auto AllocOrErr = GenericDevice.dataAlloc(
NumBlocks * DynBlockMemSize,
/*HostPtr=*/nullptr, TargetAllocTy::TARGET_ALLOC_DEVICE);
if (!AllocOrErr)
return AllocOrErr.takeError();
DynFallbackPtr = *AllocOrErr;
}
}
return DynBlockMemConfTy{DynBlockMemSize, DynNativeBlockMemSize, DynFallback,
DynFallbackPtr};
}
Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs, Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
ptrdiff_t *ArgOffsets, KernelArgsTy &KernelArgs, ptrdiff_t *ArgOffsets, KernelArgsTy &KernelArgs,
AsyncInfoWrapperTy &AsyncInfoWrapper) const { AsyncInfoWrapperTy &AsyncInfoWrapper) const {
llvm::SmallVector<void *, 16> Args; llvm::SmallVector<void *, 16> Args;
llvm::SmallVector<void *, 16> Ptrs; llvm::SmallVector<void *, 16> Ptrs;
uint32_t NumThreads[3] = {KernelArgs.ThreadLimit[0],
KernelArgs.ThreadLimit[1],
KernelArgs.ThreadLimit[2]};
uint32_t NumBlocks[3] = {KernelArgs.NumTeams[0], KernelArgs.NumTeams[1],
KernelArgs.NumTeams[2]};
if (!isBareMode()) {
NumThreads[0] = getNumThreads(GenericDevice, NumThreads);
NumBlocks[0] = getNumBlocks(GenericDevice, NumBlocks, KernelArgs.Tripcount,
NumThreads[0], KernelArgs.ThreadLimit[0] > 0);
}
auto DynBlockMemConfOrErr =
prepareBlockMemory(GenericDevice, KernelArgs, NumBlocks[0]);
if (!DynBlockMemConfOrErr)
return DynBlockMemConfOrErr.takeError();
DynBlockMemConfTy &DynBlockMemConf = *DynBlockMemConfOrErr;
if (DynBlockMemConf.FallbackPtr)
AsyncInfoWrapper.freeAllocationAfterSynchronization(
DynBlockMemConf.FallbackPtr);
auto KernelLaunchEnvOrErr = getKernelLaunchEnvironment( auto KernelLaunchEnvOrErr = getKernelLaunchEnvironment(
GenericDevice, KernelArgs.Version, AsyncInfoWrapper); GenericDevice, KernelArgs, DynBlockMemConf, AsyncInfoWrapper);
if (!KernelLaunchEnvOrErr) if (!KernelLaunchEnvOrErr)
return KernelLaunchEnvOrErr.takeError(); return KernelLaunchEnvOrErr.takeError();
@@ -531,17 +606,6 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
Args, Ptrs, *KernelLaunchEnvOrErr); Args, Ptrs, *KernelLaunchEnvOrErr);
} }
uint32_t NumThreads[3] = {KernelArgs.ThreadLimit[0],
KernelArgs.ThreadLimit[1],
KernelArgs.ThreadLimit[2]};
uint32_t NumBlocks[3] = {KernelArgs.NumTeams[0], KernelArgs.NumTeams[1],
KernelArgs.NumTeams[2]};
if (!isBareMode()) {
NumThreads[0] = getNumThreads(GenericDevice, NumThreads);
NumBlocks[0] = getNumBlocks(GenericDevice, NumBlocks, KernelArgs.Tripcount,
NumThreads[0], KernelArgs.ThreadLimit[0] > 0);
}
// Record the kernel description after we modified the argument count and num // Record the kernel description after we modified the argument count and num
// blocks/threads. // blocks/threads.
RecordReplayTy &RecordReplay = GenericDevice.Plugin.getRecordReplay(); RecordReplayTy &RecordReplay = GenericDevice.Plugin.getRecordReplay();
@@ -557,8 +621,9 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
printLaunchInfo(GenericDevice, KernelArgs, NumThreads, NumBlocks)) printLaunchInfo(GenericDevice, KernelArgs, NumThreads, NumBlocks))
return Err; return Err;
return launchImpl(GenericDevice, NumThreads, NumBlocks, KernelArgs, return launchImpl(GenericDevice, NumThreads, NumBlocks,
LaunchParams, AsyncInfoWrapper); DynBlockMemConf.NativeSize, KernelArgs, LaunchParams,
AsyncInfoWrapper);
} }
KernelLaunchParamsTy GenericKernelTy::prepareArgs( KernelLaunchParamsTy GenericKernelTy::prepareArgs(
@@ -1954,6 +2019,16 @@ int32_t GenericPluginTy::query_async(int32_t DeviceId,
return OFFLOAD_SUCCESS; return OFFLOAD_SUCCESS;
} }
InfoTreeNode GenericPluginTy::obtain_device_info(int32_t DeviceId) {
// Query the device for its info tree; on failure, report the error and
// hand back an empty tree instead of propagating it.
auto DevInfoOrErr = getDevice(DeviceId).obtainInfo();
if (!DevInfoOrErr) {
REPORT() << "Failure to obtain device " << DeviceId
<< " info: " << toString(DevInfoOrErr.takeError());
return InfoTreeNode{};
}
return std::move(*DevInfoOrErr);
}
void GenericPluginTy::print_device_info(int32_t DeviceId) { void GenericPluginTy::print_device_info(int32_t DeviceId) {
if (auto Err = getDevice(DeviceId).printInfo()) if (auto Err = getDevice(DeviceId).printInfo())
REPORT() << "Failure to print device " << DeviceId REPORT() << "Failure to print device " << DeviceId

View File

@@ -261,6 +261,7 @@ typedef enum CUdevice_attribute_enum {
typedef enum CUfunction_attribute_enum { typedef enum CUfunction_attribute_enum {
CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,
CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1,
CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8,
} CUfunction_attribute; } CUfunction_attribute;

View File

@@ -150,14 +150,23 @@ struct CUDAKernelTy : public GenericKernelTy {
// The maximum number of threads cannot exceed the maximum of the kernel. // The maximum number of threads cannot exceed the maximum of the kernel.
MaxNumThreads = std::min(MaxNumThreads, (uint32_t)MaxThreads); MaxNumThreads = std::min(MaxNumThreads, (uint32_t)MaxThreads);
int SharedMemSize;
Res = cuFuncGetAttribute(&SharedMemSize,
CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, Func);
if (auto Err = Plugin::check(Res, "Error in cuFuncGetAttribute: %s"))
return Err;
// Set the static block memory size required by the kernel.
StaticBlockMemSize = SharedMemSize;
// Retrieve the size of the arguments. // Retrieve the size of the arguments.
return initArgsSize(); return initArgsSize();
} }
/// Launch the CUDA kernel function. /// Launch the CUDA kernel function.
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3], Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
uint32_t NumBlocks[3], KernelArgsTy &KernelArgs, uint32_t NumBlocks[3], uint32_t DynBlockMemSize,
KernelLaunchParamsTy LaunchParams, KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override; AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
/// Return maximum block size for maximum occupancy /// Return maximum block size for maximum occupancy
@@ -197,7 +206,7 @@ private:
CUfunction Func; CUfunction Func;
/// The maximum amount of dynamic shared memory per thread group. By default, /// The maximum amount of dynamic shared memory per thread group. By default,
/// this is set to 48 KB. /// this is set to 48 KB.
mutable uint32_t MaxDynCGroupMemLimit = 49152; mutable uint32_t MaxDynBlockMemSize = 49152;
/// The size of the kernel arguments. /// The size of the kernel arguments.
size_t ArgsSize; size_t ArgsSize;
@@ -1411,7 +1420,7 @@ private:
KernelArgsTy KernelArgs = {}; KernelArgsTy KernelArgs = {};
uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u}; uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u};
if (auto Err = CUDAKernel.launchImpl( if (auto Err = CUDAKernel.launchImpl(
*this, NumBlocksAndThreads, NumBlocksAndThreads, KernelArgs, *this, NumBlocksAndThreads, NumBlocksAndThreads, 0, KernelArgs,
KernelLaunchParamsTy{}, AsyncInfoWrapper)) KernelLaunchParamsTy{}, AsyncInfoWrapper))
return Err; return Err;
@@ -1455,6 +1464,7 @@ private:
Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice, Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
uint32_t NumThreads[3], uint32_t NumBlocks[3], uint32_t NumThreads[3], uint32_t NumBlocks[3],
uint32_t DynBlockMemSize,
KernelArgsTy &KernelArgs, KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams, KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const { AsyncInfoWrapperTy &AsyncInfoWrapper) const {
@@ -1470,9 +1480,6 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
if (auto Err = CUDADevice.getStream(AsyncInfoWrapper, Stream)) if (auto Err = CUDADevice.getStream(AsyncInfoWrapper, Stream))
return Err; return Err;
uint32_t MaxDynCGroupMem =
std::max(KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize());
size_t ConfigArgsSize = ArgsSize; size_t ConfigArgsSize = ArgsSize;
void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, LaunchParams.Data, void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, LaunchParams.Data,
CU_LAUNCH_PARAM_BUFFER_SIZE, CU_LAUNCH_PARAM_BUFFER_SIZE,
@@ -1484,20 +1491,24 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
if (GenericDevice.getRPCServer()) if (GenericDevice.getRPCServer())
GenericDevice.Plugin.getRPCServer().Thread->notify(); GenericDevice.Plugin.getRPCServer().Thread->notify();
// Increase to the requested dynamic memory size for the device if needed.
DynBlockMemSize =
std::max(DynBlockMemSize, GenericDevice.getDynamicMemorySize());
// In case we require more memory than the current limit. // In case we require more memory than the current limit.
if (MaxDynCGroupMem >= MaxDynCGroupMemLimit) { if (DynBlockMemSize >= MaxDynBlockMemSize) {
CUresult AttrResult = cuFuncSetAttribute( CUresult AttrResult = cuFuncSetAttribute(
Func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, MaxDynCGroupMem); Func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, DynBlockMemSize);
if (auto Err = Plugin::check( if (auto Err = Plugin::check(
AttrResult, AttrResult,
"error in cuFuncSetAttribute while setting the memory limits: %s")) "error in cuFuncSetAttribute while setting the memory limits: %s"))
return Err; return Err;
MaxDynCGroupMemLimit = MaxDynCGroupMem; MaxDynBlockMemSize = DynBlockMemSize;
} }
CUresult Res = cuLaunchKernel(Func, NumBlocks[0], NumBlocks[1], NumBlocks[2], CUresult Res = cuLaunchKernel(Func, NumBlocks[0], NumBlocks[1], NumBlocks[2],
NumThreads[0], NumThreads[1], NumThreads[2], NumThreads[0], NumThreads[1], NumThreads[2],
MaxDynCGroupMem, Stream, nullptr, Config); DynBlockMemSize, Stream, nullptr, Config);
// Register a callback to indicate when the kernel is complete. // Register a callback to indicate when the kernel is complete.
if (GenericDevice.getRPCServer()) if (GenericDevice.getRPCServer())

View File

@@ -97,8 +97,8 @@ struct GenELF64KernelTy : public GenericKernelTy {
/// Launch the kernel using the libffi. /// Launch the kernel using the libffi.
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3], Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
uint32_t NumBlocks[3], KernelArgsTy &KernelArgs, uint32_t NumBlocks[3], uint32_t DynBlockMemSize,
KernelLaunchParamsTy LaunchParams, KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override { AsyncInfoWrapperTy &AsyncInfoWrapper) const override {
if (!SupportsFFI) if (!SupportsFFI)
return Plugin::error(ErrorCode::UNSUPPORTED, return Plugin::error(ErrorCode::UNSUPPORTED,

View File

@@ -124,8 +124,8 @@ public:
Error initImpl(GenericDeviceTy &GenericDevice, DeviceImageTy &Image) override; Error initImpl(GenericDeviceTy &GenericDevice, DeviceImageTy &Image) override;
/// Launch the L0 kernel function. /// Launch the L0 kernel function.
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3], Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
uint32_t NumBlocks[3], KernelArgsTy &KernelArgs, uint32_t NumBlocks[3], uint32_t DynBlockMemSize,
KernelLaunchParamsTy LaunchParams, KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override; AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
Error deinit() { Error deinit() {
CALL_ZE_RET_ERROR(zeKernelDestroy, zeKernel); CALL_ZE_RET_ERROR(zeKernelDestroy, zeKernel);

View File

@@ -413,9 +413,13 @@ Error L0KernelTy::setIndirectFlags(L0DeviceTy &l0Device,
Error L0KernelTy::launchImpl(GenericDeviceTy &GenericDevice, Error L0KernelTy::launchImpl(GenericDeviceTy &GenericDevice,
uint32_t NumThreads[3], uint32_t NumBlocks[3], uint32_t NumThreads[3], uint32_t NumBlocks[3],
KernelArgsTy &KernelArgs, uint32_t DynBlockMemSize, KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams, KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const { AsyncInfoWrapperTy &AsyncInfoWrapper) const {
if (DynBlockMemSize > 0)
return Plugin::error(ErrorCode::UNSUPPORTED,
"dynamic shared memory is unsupported in L0 plugin");
auto &l0Device = L0DeviceTy::makeL0Device(GenericDevice); auto &l0Device = L0DeviceTy::makeL0Device(GenericDevice);
__tgt_async_info *AsyncInfo = AsyncInfoWrapper; __tgt_async_info *AsyncInfo = AsyncInfoWrapper;

View File

@@ -0,0 +1,199 @@
// RUN: %libomptarget-compilexx-generic -fopenmp-version=61
// RUN: %libomptarget-run-generic | %fcheck-generic
// RUN: %libomptarget-compileoptxx-generic -fopenmp-version=61
// RUN: %libomptarget-run-generic | %fcheck-generic
// REQUIRES: gpu
// UNSUPPORTED: intelgpu
#include <omp.h>
#include <stdio.h>
#define N 512
int main() {
int Result[N], NumThreads;
// Verify the groupprivate buffer works as expected.
#pragma omp target teams num_teams(1) thread_limit(N) \
dyn_groupprivate(fallback(abort) : N * sizeof(Result[0])) \
map(from : Result, NumThreads)
{
int Buffer[N];
#pragma omp parallel
{
int *DynBuffer = (int *)omp_get_dyn_gprivate_nofb_ptr();
int TId = omp_get_thread_num();
if (TId == 0)
NumThreads = omp_get_num_threads();
Buffer[TId] = 7;
DynBuffer[TId] = 3;
#pragma omp barrier
int WrappedTId = (TId + 37) % NumThreads;
Result[TId] = Buffer[WrappedTId] + DynBuffer[WrappedTId];
}
}
if (NumThreads < N / 2 || NumThreads > N) {
printf("Expected number of threads to be in [%i:%i], but got: %i", N / 2, N,
NumThreads);
return -1;
}
int Failed = 0;
for (int i = 0; i < NumThreads; ++i) {
if (Result[i] != 7 + 3) {
printf("Result[%i] is %i, expected %i\n", i, Result[i], 7 + 3);
++Failed;
}
}
// Verify that the routines in the host returns NULL and zero.
if (omp_get_dyn_gprivate_ptr())
++Failed;
if (omp_get_dyn_gprivate_nofb_ptr())
++Failed;
if (omp_get_dyn_gprivate_size())
++Failed;
size_t MaxSize = omp_get_gprivate_limit(0, omp_access_cgroup);
size_t ExceededSize = MaxSize + 10;
// Verify that the fallback(default_mem) modifier works.
#pragma omp target dyn_groupprivate(fallback(default_mem) : ExceededSize) \
map(tofrom : Failed)
{
if (!omp_get_dyn_gprivate_ptr(0))
++Failed;
if (omp_get_dyn_gprivate_nofb_ptr(0))
++Failed;
if (omp_get_dyn_gprivate_ptr(0) == omp_get_dyn_gprivate_nofb_ptr(0))
++Failed;
if (!omp_get_dyn_gprivate_size())
++Failed;
if (omp_get_dyn_gprivate_size() != ExceededSize)
++Failed;
if (omp_get_dyn_gprivate_memspace() != omp_default_mem_space)
++Failed;
}
// Verify that the fallback(null) modifier works.
#pragma omp target dyn_groupprivate(fallback(null) : ExceededSize) \
map(tofrom : Failed)
{
if (omp_get_dyn_gprivate_ptr(0))
++Failed;
if (omp_get_dyn_gprivate_nofb_ptr(0))
++Failed;
if (omp_get_dyn_gprivate_ptr(0) != omp_get_dyn_gprivate_nofb_ptr(0))
++Failed;
if (omp_get_dyn_gprivate_size())
++Failed;
if (omp_get_dyn_gprivate_memspace() != omp_null_mem_space)
++Failed;
}
// Verify that the default modifier is fallback(default_mem).
#pragma omp target dyn_groupprivate(ExceededSize)
{
if (!omp_get_dyn_gprivate_ptr(0))
++Failed;
if (omp_get_dyn_gprivate_nofb_ptr(0))
++Failed;
if (omp_get_dyn_gprivate_ptr(0) == omp_get_dyn_gprivate_nofb_ptr(0))
++Failed;
if (!omp_get_dyn_gprivate_size())
++Failed;
if (omp_get_dyn_gprivate_size() != ExceededSize)
++Failed;
if (omp_get_dyn_gprivate_memspace() != omp_default_mem_space)
++Failed;
}
// Verify that the fallback(abort) modifier works.
#pragma omp target dyn_groupprivate(fallback(abort) : N) map(tofrom : Failed)
{
if (!omp_get_dyn_gprivate_ptr(0))
++Failed;
if (!omp_get_dyn_gprivate_nofb_ptr(0))
++Failed;
if (omp_get_dyn_gprivate_ptr(0) != omp_get_dyn_gprivate_nofb_ptr(0))
++Failed;
if (omp_get_dyn_gprivate_ptr(5) != omp_get_dyn_gprivate_nofb_ptr(5))
++Failed;
if (!omp_get_dyn_gprivate_size())
++Failed;
if (omp_get_dyn_gprivate_size() != N)
++Failed;
if (omp_get_dyn_gprivate_memspace() != omp_cgroup_mem_space)
++Failed;
}
// Verify that the fallback(default_mem) does not trigger when not needed.
#pragma omp target dyn_groupprivate(fallback(default_mem) : N) \
map(tofrom : Failed)
{
if (!omp_get_dyn_gprivate_ptr(0))
++Failed;
if (!omp_get_dyn_gprivate_nofb_ptr(0))
++Failed;
if (omp_get_dyn_gprivate_ptr(0) != omp_get_dyn_gprivate_nofb_ptr(0))
++Failed;
if (!omp_get_dyn_gprivate_size())
++Failed;
if (omp_get_dyn_gprivate_size() != N)
++Failed;
if (omp_get_dyn_gprivate_memspace() != omp_cgroup_mem_space)
++Failed;
}
// Verify that the clause works when passing a zero size.
#pragma omp target dyn_groupprivate(fallback(abort) : 0) map(tofrom : Failed)
{
if (omp_get_dyn_gprivate_ptr(0))
++Failed;
if (omp_get_dyn_gprivate_nofb_ptr(0))
++Failed;
if (omp_get_dyn_gprivate_ptr(0) != omp_get_dyn_gprivate_nofb_ptr(0))
++Failed;
if (omp_get_dyn_gprivate_size())
++Failed;
if (omp_get_dyn_gprivate_memspace() != omp_null_mem_space)
++Failed;
}
// Verify that the clause works when passing a zero size and
// fallback(default_mem).
#pragma omp target dyn_groupprivate(fallback(default_mem) : 0) \
map(tofrom : Failed)
{
if (omp_get_dyn_gprivate_ptr(0))
++Failed;
if (omp_get_dyn_gprivate_nofb_ptr(0))
++Failed;
if (omp_get_dyn_gprivate_ptr(0) != omp_get_dyn_gprivate_nofb_ptr(0))
++Failed;
if (omp_get_dyn_gprivate_size())
++Failed;
if (omp_get_dyn_gprivate_memspace() != omp_null_mem_space)
++Failed;
}
// Verify that omitting the clause is the same as setting zero size.
#pragma omp target map(tofrom : Failed)
{
if (omp_get_dyn_gprivate_ptr(0))
++Failed;
if (omp_get_dyn_gprivate_nofb_ptr(0))
++Failed;
if (omp_get_dyn_gprivate_ptr(0) != omp_get_dyn_gprivate_nofb_ptr(0))
++Failed;
if (omp_get_dyn_gprivate_size())
++Failed;
if (omp_get_dyn_gprivate_memspace() != omp_null_mem_space)
++Failed;
}
// CHECK: PASS
if (!Failed)
printf("PASS\n");
}

View File

@@ -171,9 +171,29 @@ typedef enum omp_allocator_handle_t {
KMP_ALLOCATOR_MAX_HANDLE = ~(0LU) KMP_ALLOCATOR_MAX_HANDLE = ~(0LU)
} omp_allocator_handle_t; } omp_allocator_handle_t;
/// Memory space handles visible to device code.
/// NOTE(review): the numeric values must mirror the host-side
/// omp_memspace_handle_t in <omp.h> (omp_default_mem_space is 99 there) —
/// keep the two definitions in sync.
typedef enum omp_memspace_handle_t {
  omp_null_mem_space = 0,
  omp_default_mem_space = 99,
  omp_large_cap_mem_space = 1,
  omp_const_mem_space = 2,
  omp_high_bw_mem_space = 3,
  omp_low_lat_mem_space = 4,
  omp_cgroup_mem_space = 5,
  // Sentinel forcing the underlying representation; matches the host header.
  KMP_MEMSPACE_MAX_HANDLE = ~(0LU)
} omp_memspace_handle_t;
#define __PRAGMA(STR) _Pragma(#STR) #define __PRAGMA(STR) _Pragma(#STR)
#define OMP_PRAGMA(STR) __PRAGMA(omp STR) #define OMP_PRAGMA(STR) __PRAGMA(omp STR)
///} ///}
/// The OpenMP access group type. The criterion for grouping tasks using a
/// specific grouping property.
enum omp_access_t {
  /// Groups the tasks based on the contention group to which they belong.
  omp_access_cgroup = 0,
  /// Groups the tasks based on the parallel region to which they bind.
  omp_access_pteam = 1,
};
#endif #endif

View File

@@ -226,7 +226,7 @@ struct KernelEnvironmentTy;
int8_t __kmpc_is_spmd_exec_mode(); int8_t __kmpc_is_spmd_exec_mode();
int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment, int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
KernelLaunchEnvironmentTy &KernelLaunchEnvironment); KernelLaunchEnvironmentTy *KernelLaunchEnvironment);
void __kmpc_target_deinit(); void __kmpc_target_deinit();

View File

@@ -116,7 +116,7 @@ extern Local<ThreadStateTy **> ThreadStates;
/// Initialize the state machinery. Must be called by all threads. /// Initialize the state machinery. Must be called by all threads.
void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment, void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
KernelLaunchEnvironmentTy &KernelLaunchEnvironment); KernelLaunchEnvironmentTy *KernelLaunchEnvironment);
/// Return the kernel and kernel launch environment associated with the current /// Return the kernel and kernel launch environment associated with the current
/// kernel. The former is static and contains compile time information that /// kernel. The former is static and contains compile time information that

View File

@@ -35,8 +35,8 @@ enum OMPTgtExecModeFlags : unsigned char {
}; };
static void static void
inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment, initializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
KernelLaunchEnvironmentTy &KernelLaunchEnvironment) { KernelLaunchEnvironmentTy *KernelLaunchEnvironment) {
// Order is important here. // Order is important here.
synchronize::init(IsSPMD); synchronize::init(IsSPMD);
mapping::init(IsSPMD); mapping::init(IsSPMD);
@@ -80,17 +80,17 @@ extern "C" {
/// \param Ident Source location identification, can be NULL. /// \param Ident Source location identification, can be NULL.
/// ///
int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment, int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
KernelLaunchEnvironmentTy &KernelLaunchEnvironment) { KernelLaunchEnvironmentTy *KernelLaunchEnvironment) {
ConfigurationEnvironmentTy &Configuration = KernelEnvironment.Configuration; ConfigurationEnvironmentTy &Configuration = KernelEnvironment.Configuration;
bool IsSPMD = Configuration.ExecMode & OMP_TGT_EXEC_MODE_SPMD; bool IsSPMD = Configuration.ExecMode & OMP_TGT_EXEC_MODE_SPMD;
bool UseGenericStateMachine = Configuration.UseGenericStateMachine; bool UseGenericStateMachine = Configuration.UseGenericStateMachine;
if (IsSPMD) { if (IsSPMD) {
inititializeRuntime(/*IsSPMD=*/true, KernelEnvironment, initializeRuntime(/*IsSPMD=*/true, KernelEnvironment,
KernelLaunchEnvironment); KernelLaunchEnvironment);
synchronize::threadsAligned(atomic::relaxed); synchronize::threadsAligned(atomic::relaxed);
} else { } else {
inititializeRuntime(/*IsSPMD=*/false, KernelEnvironment, initializeRuntime(/*IsSPMD=*/false, KernelEnvironment,
KernelLaunchEnvironment); KernelLaunchEnvironment);
// No need to wait since only the main threads will execute user // No need to wait since only the main threads will execute user
// code and workers will run into a barrier right away. // code and workers will run into a barrier right away.
} }

View File

@@ -40,6 +40,10 @@ using namespace ompx;
[[clang::loader_uninitialized]] static Local<KernelLaunchEnvironmentTy *> [[clang::loader_uninitialized]] static Local<KernelLaunchEnvironmentTy *>
KernelLaunchEnvironmentPtr; KernelLaunchEnvironmentPtr;
/// The pointer type for dynamic shared memory. This is important to keep
/// the alignment and address space information.
using SharedMemPtrTy = decltype(&DynamicSharedBuffer[0]);
///} ///}
namespace { namespace {
@@ -138,6 +142,60 @@ void SharedMemorySmartStackTy::pop(void *Ptr, uint64_t Bytes) {
memory::freeGlobal(Ptr, "Slow path shared memory deallocation"); memory::freeGlobal(Ptr, "Slow path shared memory deallocation");
} }
/// Manager of the dynamic cgroup (groupprivate) memory buffer for the
/// current team.
struct DynCGroupMemTy {
  /// Initialize the manager with the information from the kernel launch
  /// environment and the pointer to the native shared memory buffer.
  void init(KernelLaunchEnvironmentTy *KLE, SharedMemPtrTy NativePtr) {
    // Initialize default values: no buffer at all.
    NativeOrNullPtr = nullptr;
    FallbackPtr = nullptr;
    Size = 0;
    Fallback = DynCGroupMemFallbackType::None;

    // No launch environment means no dyn_groupprivate information.
    if (!KLE)
      return;

    // Initialize values using the kernel launch environment.
    Size = KLE->DynCGroupMemSize;
    Fallback = KLE->DynCGroupMemFb;

    // NOTE(review): a Fallback other than None presumably means the host
    // already decided the native shared-memory request could not be
    // satisfied, so the native pointer is only published when no fallback
    // was taken — confirm against the host-side launch code.
    if (Size && Fallback == DynCGroupMemFallbackType::None)
      NativeOrNullPtr = NativePtr;

    // For the default_mem fallback the host supplies one chunk per block;
    // select this block's slice by its id.
    if (Fallback == DynCGroupMemFallbackType::DefaultMem)
      FallbackPtr = static_cast<unsigned char *>(KLE->DynCGroupMemFbPtr) +
                    Size * mapping::getBlockIdInKernel();
  }

  /// Get the memory space of the buffer: omp_null_mem_space when no buffer
  /// exists, omp_cgroup_mem_space for the native buffer, and
  /// omp_default_mem_space when the default-memory fallback was taken.
  omp_memspace_handle_t getMemSpace() const {
    if (Size == 0)
      return omp_null_mem_space;
    if (Fallback == DynCGroupMemFallbackType::None)
      return omp_cgroup_mem_space;
    return omp_default_mem_space;
  }

  /// Get the size of the buffer in bytes (zero when no buffer exists).
  size_t getSize() const { return Size; }

  /// Get the native pointer or null if it was a fallback.
  SharedMemPtrTy getNativeOrNullPtr() const { return NativeOrNullPtr; }

  /// Get the native pointer or the fallback pointer.
  unsigned char *getNativeOrFallbackPtr() const {
    return (Fallback == DynCGroupMemFallbackType::DefaultMem)
               ? FallbackPtr
               : getNativeOrNullPtr();
  }

private:
  // Native shared-memory buffer, or null when a fallback (or no clause)
  // applies.
  SharedMemPtrTy NativeOrNullPtr;
  // This block's slice of the host-provided default-memory fallback buffer.
  unsigned char *FallbackPtr;
  // Requested dyn_groupprivate size in bytes.
  size_t Size;
  // Fallback kind recorded in the kernel launch environment.
  DynCGroupMemFallbackType Fallback;
};
[[clang::loader_uninitialized]] static Local<DynCGroupMemTy> DynCGroupMem;
} // namespace } // namespace
void *memory::getDynamicBuffer() { return DynamicSharedBuffer; } void *memory::getDynamicBuffer() { return DynamicSharedBuffer; }
@@ -226,13 +284,18 @@ int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
} // namespace } // namespace
void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment, void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
KernelLaunchEnvironmentTy &KernelLaunchEnvironment) { KernelLaunchEnvironmentTy *KLE) {
SharedMemorySmartStack.init(IsSPMD); SharedMemorySmartStack.init(IsSPMD);
if (KLE == reinterpret_cast<KernelLaunchEnvironmentTy *>(~0))
KLE = nullptr;
if (mapping::isInitialThreadInLevel0(IsSPMD)) { if (mapping::isInitialThreadInLevel0(IsSPMD)) {
DynCGroupMem.init(KLE, DynamicSharedBuffer);
TeamState.init(IsSPMD); TeamState.init(IsSPMD);
ThreadStates = nullptr; ThreadStates = nullptr;
KernelEnvironmentPtr = &KernelEnvironment; KernelEnvironmentPtr = &KernelEnvironment;
KernelLaunchEnvironmentPtr = &KernelLaunchEnvironment; KernelLaunchEnvironmentPtr = KLE;
} }
} }
@@ -416,6 +479,25 @@ int omp_get_team_num() { return mapping::getBlockIdInKernel(); }
int omp_get_initial_device(void) { return -1; } int omp_get_initial_device(void) { return -1; }
int omp_is_initial_device(void) { return 0; } int omp_is_initial_device(void) { return 0; }
/// Device implementation: address at Offset into the dynamic groupprivate
/// buffer, preferring the default-memory fallback buffer when one was
/// provided. The omp_access_t argument is currently ignored.
/// NOTE(review): when no buffer exists this computes null + Offset; visible
/// callers only pass 0 in that case — confirm that contract.
void *omp_get_dyn_gprivate_ptr(size_t Offset, omp_access_t) {
  return DynCGroupMem.getNativeOrFallbackPtr() + Offset;
}
/// Device implementation: address at Offset into the *native* dynamic
/// groupprivate buffer only; returns null when a fallback (or no buffer)
/// applies. The omp_access_t argument is currently ignored.
void *omp_get_dyn_gprivate_nofb_ptr(size_t Offset, omp_access_t) {
  unsigned char *Ptr = DynCGroupMem.getNativeOrNullPtr();
  // FIX: bail out before doing pointer arithmetic on a null pointer, which
  // is undefined behavior for a non-zero Offset. Returns null exactly as the
  // Offset == 0 path did before.
  if (!Ptr)
    return nullptr;
  // Ensure the alignment and address space information is kept.
  Ptr = (unsigned char *)__builtin_assume_aligned(Ptr, allocator::ALIGNMENT);
  return (SharedMemPtrTy)(Ptr + Offset);
}
/// Device implementation: size in bytes of the dynamic groupprivate buffer
/// for this launch (zero when no buffer exists). The omp_access_t argument
/// is currently ignored.
size_t omp_get_dyn_gprivate_size(omp_access_t) {
  return DynCGroupMem.getSize();
}

/// Device implementation: memory space of the dynamic groupprivate buffer —
/// null (no buffer), cgroup (native shared memory), or default (fallback).
omp_memspace_handle_t omp_get_dyn_gprivate_memspace(omp_access_t) {
  return DynCGroupMem.getMemSpace();
}
} }
extern "C" { extern "C" {

View File

@@ -607,6 +607,7 @@ kmp_set_disp_num_buffers 890
llvm_omp_target_shared_mem_space DATA llvm_omp_target_shared_mem_space DATA
llvm_omp_target_device_mem_space DATA llvm_omp_target_device_mem_space DATA
omp_null_mem_space DATA omp_null_mem_space DATA
omp_cgroup_mem_space DATA
%ifndef stub %ifndef stub
# Ordinals between 900 and 999 are reserved # Ordinals between 900 and 999 are reserved

View File

@@ -380,6 +380,11 @@
omp_uintptr_t value; omp_uintptr_t value;
} omp_alloctrait_t; } omp_alloctrait_t;
typedef enum {
omp_access_cgroup = 0,
omp_access_pteam = 1
} omp_access_t;
# if defined(_WIN32) # if defined(_WIN32)
// On Windows cl and icl do not support 64-bit enum, let's use integer then. // On Windows cl and icl do not support 64-bit enum, let's use integer then.
typedef omp_uintptr_t omp_allocator_handle_t; typedef omp_uintptr_t omp_allocator_handle_t;
@@ -403,6 +408,7 @@
extern __KMP_IMP omp_memspace_handle_t const omp_const_mem_space; extern __KMP_IMP omp_memspace_handle_t const omp_const_mem_space;
extern __KMP_IMP omp_memspace_handle_t const omp_high_bw_mem_space; extern __KMP_IMP omp_memspace_handle_t const omp_high_bw_mem_space;
extern __KMP_IMP omp_memspace_handle_t const omp_low_lat_mem_space; extern __KMP_IMP omp_memspace_handle_t const omp_low_lat_mem_space;
extern __KMP_IMP omp_memspace_handle_t const omp_cgroup_mem_space;
extern __KMP_IMP omp_memspace_handle_t const llvm_omp_target_host_mem_space; extern __KMP_IMP omp_memspace_handle_t const llvm_omp_target_host_mem_space;
extern __KMP_IMP omp_memspace_handle_t const llvm_omp_target_shared_mem_space; extern __KMP_IMP omp_memspace_handle_t const llvm_omp_target_shared_mem_space;
extern __KMP_IMP omp_memspace_handle_t const llvm_omp_target_device_mem_space; extern __KMP_IMP omp_memspace_handle_t const llvm_omp_target_device_mem_space;
@@ -439,6 +445,7 @@
omp_const_mem_space = 2, omp_const_mem_space = 2,
omp_high_bw_mem_space = 3, omp_high_bw_mem_space = 3,
omp_low_lat_mem_space = 4, omp_low_lat_mem_space = 4,
omp_cgroup_mem_space = 5,
llvm_omp_target_host_mem_space = 100, llvm_omp_target_host_mem_space = 100,
llvm_omp_target_shared_mem_space = 101, llvm_omp_target_shared_mem_space = 101,
llvm_omp_target_device_mem_space = 102, llvm_omp_target_device_mem_space = 102,
@@ -463,6 +470,11 @@
omp_allocator_handle_t allocator = omp_null_allocator, omp_allocator_handle_t allocator = omp_null_allocator,
omp_allocator_handle_t free_allocator = omp_null_allocator); omp_allocator_handle_t free_allocator = omp_null_allocator);
extern void __KAI_KMPC_CONVENTION omp_free(void * ptr, omp_allocator_handle_t a = omp_null_allocator); extern void __KAI_KMPC_CONVENTION omp_free(void * ptr, omp_allocator_handle_t a = omp_null_allocator);
extern void *__KAI_KMPC_CONVENTION omp_get_dyn_gprivate_ptr(size_t offset = 0, omp_access_t access_group = omp_access_cgroup);
extern void *__KAI_KMPC_CONVENTION omp_get_dyn_gprivate_nofb_ptr(size_t offset = 0, omp_access_t access_group = omp_access_cgroup);
extern size_t __KAI_KMPC_CONVENTION omp_get_dyn_gprivate_size(omp_access_t access_group = omp_access_cgroup);
extern omp_memspace_handle_t __KAI_KMPC_CONVENTION omp_get_dyn_gprivate_memspace(omp_access_t access_group = omp_access_cgroup);
extern size_t __KAI_KMPC_CONVENTION omp_get_gprivate_limit(int device_num, omp_access_t access_group = omp_access_cgroup);
# else # else
extern void *__KAI_KMPC_CONVENTION omp_alloc(size_t size, omp_allocator_handle_t a); extern void *__KAI_KMPC_CONVENTION omp_alloc(size_t size, omp_allocator_handle_t a);
extern void *__KAI_KMPC_CONVENTION omp_aligned_alloc(size_t align, size_t size, extern void *__KAI_KMPC_CONVENTION omp_aligned_alloc(size_t align, size_t size,
@@ -473,6 +485,11 @@
extern void *__KAI_KMPC_CONVENTION omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator, extern void *__KAI_KMPC_CONVENTION omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator,
omp_allocator_handle_t free_allocator); omp_allocator_handle_t free_allocator);
extern void __KAI_KMPC_CONVENTION omp_free(void *ptr, omp_allocator_handle_t a); extern void __KAI_KMPC_CONVENTION omp_free(void *ptr, omp_allocator_handle_t a);
extern void *__KAI_KMPC_CONVENTION omp_get_dyn_gprivate_ptr(size_t offset, omp_access_t access_group);
extern void *__KAI_KMPC_CONVENTION omp_get_dyn_gprivate_nofb_ptr(size_t offset, omp_access_t access_group);
extern size_t __KAI_KMPC_CONVENTION omp_get_dyn_gprivate_size(omp_access_t access_group);
extern omp_memspace_handle_t __KAI_KMPC_CONVENTION omp_get_dyn_gprivate_memspace(omp_access_t access_group);
extern size_t __KAI_KMPC_CONVENTION omp_get_gprivate_limit(int device_num, omp_access_t access_group);
# endif # endif
/* OpenMP TR11 routines to get memory spaces and allocators */ /* OpenMP TR11 routines to get memory spaces and allocators */

View File

@@ -1072,6 +1072,7 @@ extern omp_memspace_handle_t const omp_large_cap_mem_space;
extern omp_memspace_handle_t const omp_const_mem_space; extern omp_memspace_handle_t const omp_const_mem_space;
extern omp_memspace_handle_t const omp_high_bw_mem_space; extern omp_memspace_handle_t const omp_high_bw_mem_space;
extern omp_memspace_handle_t const omp_low_lat_mem_space; extern omp_memspace_handle_t const omp_low_lat_mem_space;
extern omp_memspace_handle_t const omp_cgroup_mem_space;
extern omp_memspace_handle_t const llvm_omp_target_host_mem_space; extern omp_memspace_handle_t const llvm_omp_target_host_mem_space;
extern omp_memspace_handle_t const llvm_omp_target_shared_mem_space; extern omp_memspace_handle_t const llvm_omp_target_shared_mem_space;
extern omp_memspace_handle_t const llvm_omp_target_device_mem_space; extern omp_memspace_handle_t const llvm_omp_target_device_mem_space;

View File

@@ -4515,6 +4515,20 @@ void omp_free(void *ptr, omp_allocator_handle_t allocator) {
} }
/* end of OpenMP 5.1 Memory Management routines */ /* end of OpenMP 5.1 Memory Management routines */
/* Host-side implementations of the dyn_groupprivate query routines. There is
   no dynamic groupprivate storage on the host, so they uniformly report "no
   buffer": NULL pointers, zero size, and the null memory space. */
void *omp_get_dyn_gprivate_ptr(size_t offset, omp_access_t access_group) {
  return NULL;
}
void *omp_get_dyn_gprivate_nofb_ptr(size_t offset, omp_access_t access_group) {
  return NULL;
}
size_t omp_get_dyn_gprivate_size(omp_access_t access_group) { return 0; }
omp_memspace_handle_t omp_get_dyn_gprivate_memspace(omp_access_t access_group) {
  return omp_null_mem_space;
}
int __kmpc_get_target_offload(void) { int __kmpc_get_target_offload(void) {
if (!__kmp_init_serial) { if (!__kmp_init_serial) {
__kmp_serial_initialize(); __kmp_serial_initialize();

View File

@@ -333,6 +333,8 @@ omp_memspace_handle_t const omp_high_bw_mem_space =
(omp_memspace_handle_t const)3; (omp_memspace_handle_t const)3;
omp_memspace_handle_t const omp_low_lat_mem_space = omp_memspace_handle_t const omp_low_lat_mem_space =
(omp_memspace_handle_t const)4; (omp_memspace_handle_t const)4;
omp_memspace_handle_t const omp_cgroup_mem_space =
(omp_memspace_handle_t const)5;
omp_memspace_handle_t const llvm_omp_target_host_mem_space = omp_memspace_handle_t const llvm_omp_target_host_mem_space =
(omp_memspace_handle_t const)100; (omp_memspace_handle_t const)100;
omp_memspace_handle_t const llvm_omp_target_shared_mem_space = omp_memspace_handle_t const llvm_omp_target_shared_mem_space =

View File

@@ -368,6 +368,8 @@ omp_memspace_handle_t const omp_high_bw_mem_space =
(omp_memspace_handle_t const)3; (omp_memspace_handle_t const)3;
omp_memspace_handle_t const omp_low_lat_mem_space = omp_memspace_handle_t const omp_low_lat_mem_space =
(omp_memspace_handle_t const)4; (omp_memspace_handle_t const)4;
omp_memspace_handle_t const omp_cgroup_mem_space =
(omp_memspace_handle_t const)5;
omp_memspace_handle_t const llvm_omp_target_host_mem_space = omp_memspace_handle_t const llvm_omp_target_host_mem_space =
(omp_memspace_handle_t const)100; (omp_memspace_handle_t const)100;
omp_memspace_handle_t const llvm_omp_target_shared_mem_space = omp_memspace_handle_t const llvm_omp_target_shared_mem_space =
@@ -454,6 +456,31 @@ void omp_free(void *ptr, omp_allocator_handle_t allocator) {
#endif #endif
} }
/* Stub-library versions of the dyn_groupprivate routines. `i` is presumably
   the stub library's initialization/ignore macro defined earlier in this
   file (TODO confirm). As on the host runtime, there is no dynamic
   groupprivate storage, so these report "no buffer". */
void *omp_get_dyn_gprivate_ptr(size_t offset, omp_access_t access_group) {
  i;
  return NULL;
}
void *omp_get_dyn_gprivate_nofb_ptr(size_t offset, omp_access_t access_group) {
  i;
  return NULL;
}
size_t omp_get_dyn_gprivate_size(omp_access_t access_group) {
  i;
  return 0;
}
omp_memspace_handle_t omp_get_dyn_gprivate_memspace(omp_access_t access_group) {
  i;
  return omp_null_mem_space;
}
size_t omp_get_gprivate_limit(int device_num, omp_access_t access_group) {
  i;
  return 0;
}
/* OpenMP 5.0 Affinity Format */ /* OpenMP 5.0 Affinity Format */
void omp_set_affinity_format(char const *format) { i; } void omp_set_affinity_format(char const *format) { i; }
size_t omp_get_affinity_format(char *buffer, size_t size) { size_t omp_get_affinity_format(char *buffer, size_t size) {