Add a new nextgen plugin that supports GPU devices through the Intel oneAPI Level Zero library. The plugin is not enabled by default and needs to be added to LIBOMPTARGET_PLUGINS_TO_BUILD explicitly. --------- Co-authored-by: Alexey Sachkov <alexey.sachkov@intel.com> Co-authored-by: Nick Sarnie <nick.sarnie@intel.com> Co-authored-by: Joseph Huber <huberjn@outlook.com>
//===--- Level Zero Target RTL Implementation -----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Memory related support for SPIR-V/Xe machine.
//
//===----------------------------------------------------------------------===//
#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0MEMORY_H
|
|
#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0MEMORY_H
|
|
|
|
#include "L0Defs.h"
#include "L0Trace.h"

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"

#include <level_zero/ze_api.h>

#include <array>
#include <cassert>
#include <cstdint>
#include <limits>
#include <list>
#include <map>
#include <memory>
#include <mutex>
#include <unordered_map>
#include <vector>
|
namespace llvm::omp::target::plugin {
|
|
|
|
class L0DeviceTy;
|
|
|
|
// Forward declarations.
|
|
struct L0OptionsTy;
|
|
class L0DeviceTy;
|
|
class L0ContextTy;
|
|
|
|
constexpr static int32_t MaxMemKind = TARGET_ALLOC_LAST + 1;
|
|
|
|
/// Descriptor of a single heap serving device-side dynamic memory
/// allocation. All sizes are in bytes.
/// NOTE(review): BlockDesc/BlockCounter look like pointers into
/// device-accessible storage owned elsewhere -- confirm ownership before
/// freeing them here.
struct DynamicMemHeapTy {
  /// Base address memory is allocated from.
  uintptr_t AllocBase = 0;
  /// Minimal size served by the current heap.
  size_t BlockSize = 0;
  /// Max size served by the current heap.
  size_t MaxSize = 0;
  /// Available memory blocks.
  uint32_t NumBlocks = 0;
  /// Number of block descriptors.
  uint32_t NumBlockDesc = 0;
  /// Number of block counters.
  uint32_t NumBlockCounter = 0;
  /// List of memory block descriptors.
  uint64_t *BlockDesc = nullptr;
  /// List of memory block counters.
  uint32_t *BlockCounter = nullptr;
};
|
struct DynamicMemPoolTy {
|
|
/// Location of device memory blocks.
|
|
void *PoolBase = nullptr;
|
|
/// Heap size common to all heaps.
|
|
size_t HeapSize = 0;
|
|
/// Number of heaps available.
|
|
uint32_t NumHeaps = 0;
|
|
/// Heap descriptors (using fixed-size array to simplify memory allocation).
|
|
DynamicMemHeapTy HeapDesc[8];
|
|
};
|
|
|
|
/// Memory allocation information used in memory allocation/deallocation.
struct MemAllocInfoTy {
  /// Base address allocated from compute runtime.
  void *Base = nullptr;
  /// Allocation size known to users/libomptarget.
  size_t ReqSize = 0;
  /// Allocation size known to the plugin (can be larger than ReqSize).
  size_t AllocSize = 0;
  /// TARGET_ALLOC kind.
  int32_t Kind = TARGET_ALLOC_DEFAULT;
  /// Is the allocation from a pool?
  bool InPool = false;
  /// Is an implicit argument?
  bool ImplicitArg = false;

  /// Default: an empty record describing no allocation.
  MemAllocInfoTy() = default;

  /// Construct a fully-specified allocation record.
  MemAllocInfoTy(void *Base, size_t ReqSize, size_t AllocSize, int32_t Kind,
                 bool InPool, bool ImplicitArg)
      : Base(Base), ReqSize(ReqSize), AllocSize(AllocSize), Kind(Kind),
        InPool(InPool), ImplicitArg(ImplicitArg) {}
};
|
/// Responsible for all activities involving memory allocation/deallocation.
|
|
/// It contains memory pool management, memory allocation bookkeeping.
|
|
class MemAllocatorTy {
|
|
|
|
/// Simple memory allocation statistics. Maintains numbers for pool allocation
|
|
/// and GPU RT allocation.
|
|
struct MemStatTy {
|
|
size_t Requested[2] = {0, 0}; // Requested bytes.
|
|
size_t Allocated[2] = {0, 0}; // Allocated bytes.
|
|
size_t Freed[2] = {0, 0}; // Freed bytes.
|
|
size_t InUse[2] = {0, 0}; // Current memory in use.
|
|
size_t PeakUse[2] = {0, 0}; // Peak bytes used.
|
|
size_t NumAllocs[2] = {0, 0}; // Number of allocations.
|
|
};
|
|
|
|
/// Memory pool which enables reuse of already allocated blocks:
|
|
/// -- Pool maintains a list of buckets each of which can allocate fixed-size
|
|
/// memory.
|
|
/// -- Each bucket maintains a list of memory blocks allocated by GPU RT.
|
|
/// -- Each memory block can allocate multiple fixed-size memory requested by
|
|
/// offload RT or user.
|
|
/// -- Memory allocation falls back to GPU RT allocation when the pool size
|
|
/// (total memory used by pool) reaches a threshold.
|
|
class MemPoolTy {
|
|
|
|
/// Memory block maintained in each bucket.
|
|
struct BlockTy {
|
|
/// Base address of this block.
|
|
uintptr_t Base = 0;
|
|
/// Size of the block.
|
|
size_t Size = 0;
|
|
/// Supported allocation size by this block.
|
|
size_t ChunkSize = 0;
|
|
/// Total number of slots.
|
|
uint32_t NumSlots = 0;
|
|
/// Maximum slot value.
|
|
static constexpr uint32_t MaxSlots =
|
|
std::numeric_limits<decltype(NumSlots)>::max();
|
|
/// Number of slots in use.
|
|
uint32_t NumUsedSlots = 0;
|
|
/// Cached available slot returned by the last dealloc() call.
|
|
uint32_t FreeSlot = MaxSlots;
|
|
/// Marker for the currently used slots.
|
|
std::vector<bool> UsedSlots;
|
|
|
|
BlockTy(void *_Base, size_t _Size, size_t _ChunkSize) {
|
|
Base = reinterpret_cast<uintptr_t>(_Base);
|
|
Size = _Size;
|
|
ChunkSize = _ChunkSize;
|
|
NumSlots = Size / ChunkSize;
|
|
NumUsedSlots = 0;
|
|
UsedSlots.resize(NumSlots, /*InitValue=*/false);
|
|
}
|
|
|
|
/// Check if the current block is fully used.
|
|
bool isFull() const { return NumUsedSlots == NumSlots; }
|
|
|
|
/// Check if the given address belongs to the current block.
|
|
bool contains(void *Mem) const {
|
|
auto M = reinterpret_cast<uintptr_t>(Mem);
|
|
return M >= Base && M < Base + Size;
|
|
}
|
|
|
|
/// Allocate a single chunk from the block.
|
|
void *alloc();
|
|
|
|
/// Deallocate the given memory.
|
|
void dealloc(void *Mem);
|
|
}; // BlockTy
|
|
|
|
/// Allocation kind for the current pool.
|
|
int32_t AllocKind = TARGET_ALLOC_DEFAULT;
|
|
/// Access to the allocator.
|
|
MemAllocatorTy *Allocator = nullptr;
|
|
/// Minimum supported memory allocation size from pool.
|
|
size_t AllocMin = 1 << 6; // 64B
|
|
/// Maximum supported memory allocation size from pool.
|
|
size_t AllocMax = 0;
|
|
/// Allocation size when the pool needs to allocate a block.
|
|
size_t AllocUnit = 1 << 16; // 64KB
|
|
/// Capacity of each block in the buckets which decides number of
|
|
/// allocatable chunks from the block. Each block in the bucket can serve
|
|
/// at least BlockCapacity chunks.
|
|
/// If ChunkSize * BlockCapacity <= AllocUnit
|
|
/// BlockSize = AllocUnit
|
|
/// Otherwise,
|
|
/// BlockSize = ChunkSize * BlockCapacity
|
|
/// This simply means how much memory is over-allocated.
|
|
uint32_t BlockCapacity = 0;
|
|
/// Total memory allocated from GPU RT for this pool.
|
|
size_t PoolSize = 0;
|
|
/// Maximum allowed pool size. Allocation falls back to GPU RT allocation if
|
|
/// when PoolSize reaches PoolSizeMax.
|
|
size_t PoolSizeMax = 0;
|
|
/// Small allocation size allowed in the pool even if pool size is over the
|
|
/// pool size limit.
|
|
size_t SmallAllocMax = 1024;
|
|
/// Small allocation pool size.
|
|
size_t SmallPoolSize = 0;
|
|
/// Small allocation pool size max (4MB).
|
|
size_t SmallPoolSizeMax = (4 << 20);
|
|
/// List of buckets.
|
|
std::vector<std::vector<BlockTy *>> Buckets;
|
|
/// List of bucket parameters.
|
|
std::vector<std::pair<size_t, size_t>> BucketParams;
|
|
/// Map from allocated pointer to corresponding block.
|
|
llvm::DenseMap<void *, BlockTy *> PtrToBlock;
|
|
/// Simple stats counting miss/hit in each bucket.
|
|
std::vector<std::pair<uint64_t, uint64_t>> BucketStats;
|
|
/// Need to zero-initialize after L0 allocation.
|
|
bool ZeroInit = false;
|
|
|
|
/// Get bucket ID from the specified allocation size.
|
|
uint32_t getBucketId(size_t Size) {
|
|
uint32_t Count = 0;
|
|
for (size_t SZ = AllocMin; SZ < Size; Count++)
|
|
SZ <<= 1;
|
|
return Count;
|
|
}
|
|
|
|
public:
|
|
MemPoolTy() = default;
|
|
MemPoolTy(const MemPoolTy &) = delete;
|
|
MemPoolTy(MemPoolTy &&) = delete;
|
|
MemPoolTy &operator=(const MemPoolTy &) = delete;
|
|
MemPoolTy &operator=(const MemPoolTy &&) = delete;
|
|
~MemPoolTy() = default;
|
|
|
|
void printUsage();
|
|
|
|
/// Initialize pool with allocation kind, allocator, and user options.
|
|
Error init(int32_t Kind, MemAllocatorTy *Allocator,
|
|
const L0OptionsTy &Option);
|
|
// Initialize pool used for reduction pool.
|
|
Error init(MemAllocatorTy *Allocator, const L0OptionsTy &Option);
|
|
// Initialize pool used for small memory pool with fixed parameters.
|
|
Error init(MemAllocatorTy *Allocator);
|
|
|
|
/// Release resources used in the pool.
|
|
Error deinit();
|
|
|
|
/// Allocate the requested size of memory from this pool.
|
|
/// AllocSize is the chunk size internally used for the returned memory.
|
|
Expected<void *> alloc(size_t Size, size_t &AllocSize);
|
|
/// Deallocate the specified memory and returns block size deallocated.
|
|
size_t dealloc(void *Ptr);
|
|
}; // MemPoolTy
|
|
|
|
/// Allocation information maintained in the plugin.
|
|
class MemAllocInfoMapTy {
|
|
/// Map from allocated pointer to allocation information.
|
|
std::map<void *, MemAllocInfoTy> Map;
|
|
/// Map from target alloc kind to number of implicit arguments.
|
|
std::array<uint32_t, MaxMemKind> NumImplicitArgs;
|
|
|
|
public:
|
|
/// Add allocation information to the map.
|
|
void add(void *Ptr, void *Base, size_t ReqSize, size_t AllocSize,
|
|
int32_t Kind, bool InPool = false, bool ImplicitArg = false);
|
|
|
|
/// Remove allocation information for the given memory location.
|
|
bool remove(void *Ptr, MemAllocInfoTy *Removed = nullptr);
|
|
|
|
/// Finds allocation information for the given memory location.
|
|
const MemAllocInfoTy *find(void *Ptr) const {
|
|
auto AllocInfo = Map.find(Ptr);
|
|
if (AllocInfo == Map.end())
|
|
return nullptr;
|
|
else
|
|
return &AllocInfo->second;
|
|
}
|
|
|
|
/// Check if the map contains the given pointer and offset.
|
|
bool contains(const void *Ptr, size_t Size) const {
|
|
if (Map.size() == 0)
|
|
return false;
|
|
auto I = Map.upper_bound(const_cast<void *>(Ptr));
|
|
if (I == Map.begin())
|
|
return false;
|
|
--I;
|
|
|
|
uintptr_t PtrAsInt = reinterpret_cast<uintptr_t>(Ptr);
|
|
uintptr_t MapBase = reinterpret_cast<uintptr_t>(I->first);
|
|
uintptr_t MapSize = static_cast<uintptr_t>(I->second.ReqSize);
|
|
|
|
bool Ret = MapBase <= PtrAsInt && PtrAsInt + Size <= MapBase + MapSize;
|
|
return Ret;
|
|
}
|
|
|
|
/// Returns the number of implicit arguments for the specified allocation
|
|
/// kind.
|
|
size_t getNumImplicitArgs(int32_t Kind) {
|
|
assert(Kind >= 0 && Kind < MaxMemKind &&
|
|
"Invalid target allocation kind");
|
|
return NumImplicitArgs[Kind];
|
|
}
|
|
}; // MemAllocInfoMapTy
|
|
|
|
/// L0 context to use.
|
|
const L0ContextTy *L0Context = nullptr;
|
|
/// L0 device to use.
|
|
L0DeviceTy *Device = nullptr;
|
|
/// Whether the device supports large memory allocation.
|
|
bool SupportsLargeMem = false;
|
|
/// Cached max alloc size supported by device.
|
|
uint64_t MaxAllocSize;
|
|
/// Map from allocation kind to memory statistics.
|
|
std::array<MemStatTy, MaxMemKind> Stats;
|
|
/// Map from allocation kind to memory pool.
|
|
std::array<std::unique_ptr<MemPoolTy>, MaxMemKind> Pools;
|
|
|
|
/// Memory pool dedicated to reduction scratch space.
|
|
std::unique_ptr<MemPoolTy> ReductionPool;
|
|
/// Memory pool dedicated to reduction counters.
|
|
std::unique_ptr<MemPoolTy> CounterPool;
|
|
/// Allocation information map.
|
|
MemAllocInfoMapTy AllocInfo;
|
|
/// RTL-owned memory that needs to be freed automatically.
|
|
std::vector<void *> MemOwned;
|
|
/// Lock protection.
|
|
std::mutex Mtx;
|
|
/// Allocator only supports host memory.
|
|
bool IsHostMem = false;
|
|
// Internal deallocation function to be called when already
|
|
// hondling the Mtx lock.
|
|
Error deallocLocked(void *Ptr);
|
|
|
|
/// Allocate memory from L0 GPU RT.
|
|
Expected<void *> allocFromL0(size_t Size, size_t Align, int32_t Kind);
|
|
/// Deallocate memory from L0 GPU RT.
|
|
Error deallocFromL0(void *Ptr);
|
|
|
|
/// We use over-allocation workaround to support target pointer with
|
|
/// offset, and positive "ActiveSize" is specified in such cases to
|
|
/// correct debug logging.
|
|
Expected<void *> allocFromL0AndLog(size_t Size, size_t Align, int32_t Kind,
|
|
size_t ActiveSize = 0) {
|
|
auto MemOrErr = allocFromL0(Size, Align, Kind);
|
|
if (!MemOrErr)
|
|
return MemOrErr;
|
|
size_t LoggedSize = ActiveSize ? ActiveSize : Size;
|
|
log(LoggedSize, Size, Kind);
|
|
return MemOrErr;
|
|
}
|
|
|
|
/// Log memory allocation/deallocation.
|
|
void log(size_t ReqSize, size_t Size, int32_t Kind, bool Pool = false) {
|
|
if (Kind < 0 || Kind >= MaxMemKind)
|
|
return; // Stat is disabled.
|
|
|
|
auto &ST = Stats[Kind];
|
|
int32_t I = Pool ? 1 : 0;
|
|
if (ReqSize > 0) {
|
|
ST.Requested[I] += ReqSize;
|
|
ST.Allocated[I] += Size;
|
|
ST.InUse[I] += Size;
|
|
ST.NumAllocs[I]++;
|
|
} else {
|
|
ST.Freed[I] += Size;
|
|
ST.InUse[I] -= Size;
|
|
}
|
|
ST.PeakUse[I] = (std::max)(ST.PeakUse[I], ST.InUse[I]);
|
|
}
|
|
|
|
/// Perform copy operation.
|
|
Error enqueueMemCopy(void *Dst, const void *Src, size_t Size);
|
|
/// Perform memory fill operation.
|
|
Error enqueueMemSet(void *Dst, int8_t Value, size_t Size);
|
|
|
|
/// Allocate memory with the specified information from a memory pool.
|
|
Expected<void *> allocFromPool(size_t Size, size_t Align, int32_t Kind,
|
|
intptr_t Offset, bool UserAlloc,
|
|
bool DevMalloc, uint32_t MemAdvice,
|
|
AllocOptionTy AllocOpt);
|
|
/// Deallocate memory from memory pool.
|
|
Error deallocFromPool(void *Ptr) {
|
|
std::lock_guard<std::mutex> Lock(Mtx);
|
|
return deallocLocked(Ptr);
|
|
}
|
|
|
|
public:
|
|
MemAllocatorTy()
|
|
: MaxAllocSize(std::numeric_limits<decltype(MaxAllocSize)>::max()) {}
|
|
|
|
MemAllocatorTy(const MemAllocatorTy &) = delete;
|
|
MemAllocatorTy(MemAllocatorTy &&) = delete;
|
|
MemAllocatorTy &operator=(const MemAllocatorTy &) = delete;
|
|
MemAllocatorTy &operator=(const MemAllocatorTy &&) = delete;
|
|
~MemAllocatorTy() = default;
|
|
|
|
Error initDevicePools(L0DeviceTy &L0Device, const L0OptionsTy &Option);
|
|
Error initHostPool(L0ContextTy &Driver, const L0OptionsTy &Option);
|
|
void updateMaxAllocSize(L0DeviceTy &L0Device);
|
|
|
|
/// Release resources and report statistics if requested.
|
|
Error deinit();
|
|
|
|
/// Allocate memory with the specified information from a memory pool.
|
|
Expected<void *> alloc(size_t Size, size_t Align, int32_t Kind,
|
|
intptr_t Offset, bool UserAlloc, bool DevMalloc,
|
|
uint32_t MemAdvice, AllocOptionTy AllocOpt) {
|
|
return allocFromPool(Size, Align, Kind, Offset, UserAlloc, DevMalloc,
|
|
MemAdvice, AllocOpt);
|
|
}
|
|
|
|
/// Deallocate memory.
|
|
Error dealloc(void *Ptr) { return deallocFromPool(Ptr); }
|
|
|
|
/// Check if the given memory location and offset belongs to any allocated
|
|
/// memory.
|
|
bool contains(const void *Ptr, size_t Size) {
|
|
std::lock_guard<std::mutex> Lock(Mtx);
|
|
return AllocInfo.contains(Ptr, Size);
|
|
}
|
|
|
|
/// Get allocation information for the specified memory location.
|
|
const MemAllocInfoTy *getAllocInfo(void *Ptr) {
|
|
std::lock_guard<std::mutex> Lock(Mtx);
|
|
return AllocInfo.find(Ptr);
|
|
}
|
|
|
|
/// Get kernel indirect access flags using implicit argument info.
|
|
ze_kernel_indirect_access_flags_t getIndirectFlags() {
|
|
std::lock_guard<std::mutex> Lock(Mtx);
|
|
ze_kernel_indirect_access_flags_t Ret = 0;
|
|
if (AllocInfo.getNumImplicitArgs(TARGET_ALLOC_DEVICE) > 0)
|
|
Ret |= ZE_KERNEL_INDIRECT_ACCESS_FLAG_DEVICE;
|
|
if (AllocInfo.getNumImplicitArgs(TARGET_ALLOC_HOST) > 0)
|
|
Ret |= ZE_KERNEL_INDIRECT_ACCESS_FLAG_HOST;
|
|
if (AllocInfo.getNumImplicitArgs(TARGET_ALLOC_SHARED) > 0)
|
|
Ret |= ZE_KERNEL_INDIRECT_ACCESS_FLAG_SHARED;
|
|
return Ret;
|
|
}
|
|
}; /// MemAllocatorTy
|
|
|
|
// Simple generic wrapper to reuse objects.
// Objects must have a zero-argument accessible constructor.
template <class ObjTy> class ObjPool {
  // Protects the free list below.
  std::unique_ptr<std::mutex> Mtx;
  // List of objects available for reuse.
  std::list<ObjTy *> Objects;

public:
  ObjPool() { Mtx.reset(new std::mutex); }

  ObjPool(const ObjPool &) = delete;
  // Fixed: was "ObjPool(ObjPool &)" (a redundant non-const copy-ctor
  // delete); the move constructor is what needs to be suppressed.
  ObjPool(ObjPool &&) = delete;
  ObjPool &operator=(const ObjPool &) = delete;
  ObjPool &operator=(ObjPool &&) = delete;

  /// Return a pooled object if one is available, otherwise a newly
  /// default-constructed one. The caller owns the result until it is
  /// handed back via release().
  ObjTy *get() {
    {
      // The emptiness check must happen under the lock: the previous
      // unlocked pre-check raced with concurrent release() calls.
      std::lock_guard<std::mutex> Lock(*Mtx);
      if (!Objects.empty()) {
        ObjTy *Ret = Objects.back();
        Objects.pop_back();
        return Ret;
      }
    }
    return new ObjTy();
  }

  /// Hand an object back to the pool for later reuse.
  void release(ObjTy *Obj) {
    std::lock_guard<std::mutex> Lock(*Mtx);
    Objects.push_back(Obj);
  }

  /// Destroy all objects currently held by the pool. Objects still checked
  /// out via get() are not tracked and must be released or freed by their
  /// owners.
  ~ObjPool() {
    for (auto *Object : Objects)
      delete Object;
  }
};
|
|
/// Common event pool used in the plugin. This event pool assumes all events
/// from the pool are host-visible and use the same event pool flag.
class EventPoolTy {
  /// Size of L0 event pool created on demand.
  size_t PoolSize = 64;

  /// Context of the events.
  ze_context_handle_t Context = nullptr;

  /// Additional event pool flags common to this pool.
  uint32_t Flags = 0;

  /// Protection. Held by pointer -- presumably so the enclosing type stays
  /// movable despite std::mutex being immovable; confirm before changing.
  std::unique_ptr<std::mutex> Mtx;

  /// List of created L0 event pools.
  std::list<ze_event_pool_handle_t> Pools;

  /// List of free L0 events.
  std::list<ze_event_handle_t> Events;

#ifdef OMPT_SUPPORT
  /// Event to OMPT record map. The timestamp information is recorded to the
  /// OMPT record before the event is recycled.
  std::unordered_map<ze_event_handle_t, ompt_record_ompt_t *> EventToRecord;
#endif // OMPT_SUPPORT

public:
  /// Initialize context, flags, and mutex.
  Error init(ze_context_handle_t ContextIn, uint32_t FlagsIn) {
    Context = ContextIn;
    Flags = FlagsIn;
    Mtx.reset(new std::mutex);
    return Plugin::success();
  }

  /// Destroys L0 resources.
  /// NOTE(review): Events/Pools are not cleared afterwards, so a second
  /// deinit() would destroy the same handles again -- confirm the pool is
  /// never deinitialized twice.
  Error deinit() {
    for (auto E : Events)
      CALL_ZE_RET_ERROR(zeEventDestroy, E);
    for (auto P : Pools)
      CALL_ZE_RET_ERROR(zeEventPoolDestroy, P);
    return Plugin::success();
  }

  /// Get a free event from the pool.
  Expected<ze_event_handle_t> getEvent();

  /// Return an event to the pool.
  Error releaseEvent(ze_event_handle_t Event, L0DeviceTy &Device);
};
|
|
/// Staging buffer.
|
|
/// A single staging buffer is not enough when batching is enabled since there
|
|
/// can be multiple pending copy operations.
|
|
class StagingBufferTy {
|
|
/// Context for L0 calls.
|
|
ze_context_handle_t Context = nullptr;
|
|
/// Max allowed size for staging buffer.
|
|
size_t Size = L0StagingBufferSize;
|
|
/// Number of buffers allocated together.
|
|
size_t Count = L0StagingBufferCount;
|
|
/// Buffers increasing by Count if a new buffer is required.
|
|
llvm::SmallVector<void *> Buffers;
|
|
/// Next buffer location in the buffers.
|
|
size_t Offset = 0;
|
|
|
|
Expected<void *> addBuffers() {
|
|
ze_host_mem_alloc_desc_t AllocDesc{ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC,
|
|
nullptr, 0};
|
|
void *Ret = nullptr;
|
|
size_t AllocSize = Size * Count;
|
|
CALL_ZE_RET_ERROR(zeMemAllocHost, Context, &AllocDesc, AllocSize,
|
|
L0DefaultAlignment, &Ret);
|
|
Buffers.push_back(Ret);
|
|
return Ret;
|
|
}
|
|
|
|
public:
|
|
StagingBufferTy() = default;
|
|
StagingBufferTy(const StagingBufferTy &) = delete;
|
|
StagingBufferTy(StagingBufferTy &&) = delete;
|
|
StagingBufferTy &operator=(const StagingBufferTy &) = delete;
|
|
StagingBufferTy &operator=(const StagingBufferTy &&) = delete;
|
|
~StagingBufferTy() = default;
|
|
|
|
Error clear() {
|
|
for (auto Ptr : Buffers)
|
|
CALL_ZE_RET_ERROR(zeMemFree, Context, Ptr);
|
|
Context = nullptr;
|
|
return Plugin::success();
|
|
}
|
|
|
|
bool initialized() const { return Context != nullptr; }
|
|
|
|
void init(ze_context_handle_t ContextIn, size_t SizeIn, size_t CountIn) {
|
|
Context = ContextIn;
|
|
Size = SizeIn;
|
|
Count = CountIn;
|
|
}
|
|
|
|
void reset() { Offset = 0; }
|
|
|
|
/// Always return the first buffer.
|
|
Expected<void *> get() {
|
|
if (Size == 0 || Count == 0)
|
|
return nullptr;
|
|
return Buffers.empty() ? addBuffers() : Buffers.front();
|
|
}
|
|
|
|
/// Return the next available buffer.
|
|
Expected<void *> getNext() {
|
|
void *Ret = nullptr;
|
|
if (Size == 0 || Count == 0)
|
|
return Ret;
|
|
|
|
size_t AllocSize = Size * Count;
|
|
bool NeedToGrow = Buffers.empty() || Offset >= Buffers.size() * AllocSize;
|
|
if (NeedToGrow) {
|
|
auto PtrOrErr = addBuffers();
|
|
if (!PtrOrErr)
|
|
return PtrOrErr.takeError();
|
|
Ret = *PtrOrErr;
|
|
} else
|
|
Ret = reinterpret_cast<void *>(
|
|
reinterpret_cast<uintptr_t>(Buffers.back()) + (Offset % AllocSize));
|
|
|
|
if (!Ret)
|
|
return nullptr;
|
|
|
|
Offset += Size;
|
|
return Ret;
|
|
}
|
|
|
|
/// Return either a fixed buffer or next buffer.
|
|
Expected<void *> get(bool Next) { return Next ? getNext() : get(); }
|
|
};
|
|
|
|
} // namespace llvm::omp::target::plugin
|
|
|
|
#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0MEMORY_H
|