Add a new nextgen plugin that supports GPU devices through the Intel oneAPI Level Zero library. The plugin is not enabled by default and needs to be added to LIBOMPTARGET_PLUGINS_TO_BUILD explicitly. --------- Co-authored-by: Alexey Sachkov <alexey.sachkov@intel.com> Co-authored-by: Nick Sarnie <nick.sarnie@intel.com> Co-authored-by: Joseph Huber <huberjn@outlook.com>
//===--- Level Zero Target RTL Implementation -----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Memory related support for SPIR-V/Xe machine.
//
//===----------------------------------------------------------------------===//
#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0MEMORY_H
|
|
#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0MEMORY_H
|
|
|
|
#include "L0Defs.h"
#include "L0Trace.h"

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"

#include <level_zero/ze_api.h>

#include <array>
#include <cassert>
#include <cstdint>
#include <limits>
#include <list>
#include <map>
#include <memory>
#include <mutex>
#include <unordered_map>
#include <vector>
|
namespace llvm::omp::target::plugin {
|
|
|
|
class L0DeviceTy;
|
|
|
|
// Forward declarations.
|
|
struct L0OptionsTy;
|
|
class L0DeviceTy;
|
|
class L0ContextTy;
|
|
|
|
constexpr static int32_t MaxMemKind = TARGET_ALLOC_LAST + 1;
|
|
|
|
/// Descriptor of a single heap serving device-side dynamic memory
/// allocation. All sizes are in bytes.
/// NOTE(review): BlockDesc/BlockCounter look like pointers into
/// device-accessible storage owned elsewhere -- confirm ownership before
/// freeing them here.
struct DynamicMemHeapTy {
  /// Base address memory is allocated from.
  uintptr_t AllocBase = 0;
  /// Minimal size served by the current heap.
  size_t BlockSize = 0;
  /// Max size served by the current heap.
  size_t MaxSize = 0;
  /// Available memory blocks.
  uint32_t NumBlocks = 0;
  /// Number of block descriptors.
  uint32_t NumBlockDesc = 0;
  /// Number of block counters.
  uint32_t NumBlockCounter = 0;
  /// List of memory block descriptors.
  uint64_t *BlockDesc = nullptr;
  /// List of memory block counters.
  uint32_t *BlockCounter = nullptr;
};
|
struct DynamicMemPoolTy {
|
|
/// Location of device memory blocks.
|
|
void *PoolBase = nullptr;
|
|
/// Heap size common to all heaps.
|
|
size_t HeapSize = 0;
|
|
/// Number of heaps available.
|
|
uint32_t NumHeaps = 0;
|
|
/// Heap descriptors (using fixed-size array to simplify memory allocation).
|
|
DynamicMemHeapTy HeapDesc[8];
|
|
};
|
|
|
|
/// Memory allocation information used in memory allocation/deallocation.
struct MemAllocInfoTy {
  /// Base address allocated from compute runtime.
  void *Base = nullptr;
  /// Allocation size known to users/libomptarget.
  size_t ReqSize = 0;
  /// Allocation size known to the plugin (can be larger than ReqSize).
  size_t AllocSize = 0;
  /// TARGET_ALLOC kind.
  int32_t Kind = TARGET_ALLOC_DEFAULT;
  /// Is the allocation from a pool?
  bool InPool = false;
  /// Is an implicit argument?
  bool ImplicitArg = false;

  /// Default: an empty record describing no allocation.
  MemAllocInfoTy() = default;

  /// Construct a fully-specified allocation record.
  MemAllocInfoTy(void *Base, size_t ReqSize, size_t AllocSize, int32_t Kind,
                 bool InPool, bool ImplicitArg)
      : Base(Base), ReqSize(ReqSize), AllocSize(AllocSize), Kind(Kind),
        InPool(InPool), ImplicitArg(ImplicitArg) {}
};
|
/// Responsible for all activities involving memory allocation/deallocation.
|
|
/// It contains memory pool management, memory allocation bookkeeping.
|
|
class MemAllocatorTy {
|
|
|
|
/// Simple memory allocation statistics. Maintains numbers for pool allocation
|
|
/// and GPU RT allocation.
|
|
struct MemStatTy {
|
|
size_t Requested[2] = {0, 0}; // Requested bytes.
|
|
size_t Allocated[2] = {0, 0}; // Allocated bytes.
|
|
size_t Freed[2] = {0, 0}; // Freed bytes.
|
|
size_t InUse[2] = {0, 0}; // Current memory in use.
|
|
size_t PeakUse[2] = {0, 0}; // Peak bytes used.
|
|
size_t NumAllocs[2] = {0, 0}; // Number of allocations.
|
|
};
|
|
|
|
/// Memory pool which enables reuse of already allocated blocks:
|
|
/// -- Pool maintains a list of buckets each of which can allocate fixed-size
|
|
/// memory.
|
|
/// -- Each bucket maintains a list of memory blocks allocated by GPU RT.
|
|
/// -- Each memory block can allocate multiple fixed-size memory requested by
|
|
/// offload RT or user.
|
|
/// -- Memory allocation falls back to GPU RT allocation when the pool size
|
|
/// (total memory used by pool) reaches a threshold.
|
|
class MemPoolTy {
|
|
|
|
/// Memory block maintained in each bucket.
|
|
struct BlockTy {
|
|
/// Base address of this block.
|
|
uintptr_t Base = 0;
|
|
/// Size of the block.
|
|
size_t Size = 0;
|
|
/// Supported allocation size by this block.
|
|
size_t ChunkSize = 0;
|
|
/// Total number of slots.
|
|
uint32_t NumSlots = 0;
|
|
/// Maximum slot value.
|
|
static constexpr uint32_t MaxSlots =
|
|
std::numeric_limits<decltype(NumSlots)>::max();
|
|
/// Number of slots in use.
|
|
uint32_t NumUsedSlots = 0;
|
|
/// Cached available slot returned by the last dealloc() call.
|
|
uint32_t FreeSlot = MaxSlots;
|
|
/// Marker for the currently used slots.
|
|
std::vector<bool> UsedSlots;
|
|
|
|
BlockTy(void *_Base, size_t _Size, size_t _ChunkSize) {
|
|
Base = reinterpret_cast<uintptr_t>(_Base);
|
|
Size = _Size;
|
|
ChunkSize = _ChunkSize;
|
|
NumSlots = Size / ChunkSize;
|
|
NumUsedSlots = 0;
|
|
UsedSlots.resize(NumSlots, /*InitValue=*/false);
|
|
}
|
|
|
|
/// Check if the current block is fully used.
|
|
bool isFull() const { return NumUsedSlots == NumSlots; }
|
|
|
|
/// Check if the given address belongs to the current block.
|
|
bool contains(void *Mem) const {
|
|
auto M = reinterpret_cast<uintptr_t>(Mem);
|
|
return M >= Base && M < Base + Size;
|
|
}
|
|
|
|
/// Allocate a single chunk from the block.
|
|
void *alloc();
|
|
|
|
/// Deallocate the given memory.
|
|
void dealloc(void *Mem);
|
|
}; // BlockTy
|
|
|
|
/// Allocation kind for the current pool.
|
|
int32_t AllocKind = TARGET_ALLOC_DEFAULT;
|
|
/// Access to the allocator.
|
|
MemAllocatorTy *Allocator = nullptr;
|
|
/// Minimum supported memory allocation size from pool.
|
|
size_t AllocMin = 1 << 6; // 64B
|
|
/// Maximum supported memory allocation size from pool.
|
|
size_t AllocMax = 0;
|
|
/// Allocation size when the pool needs to allocate a block.
|
|
size_t AllocUnit = 1 << 16; // 64KB
|
|
/// Capacity of each block in the buckets which decides number of
|
|
/// allocatable chunks from the block. Each block in the bucket can serve
|
|
/// at least BlockCapacity chunks.
|
|
/// If ChunkSize * BlockCapacity <= AllocUnit
|
|
/// BlockSize = AllocUnit
|
|
/// Otherwise,
|
|
/// BlockSize = ChunkSize * BlockCapacity
|
|
/// This simply means how much memory is over-allocated.
|
|
uint32_t BlockCapacity = 0;
|
|
/// Total memory allocated from GPU RT for this pool.
|
|
size_t PoolSize = 0;
|
|
/// Maximum allowed pool size. Allocation falls back to GPU RT allocation if
|
|
/// when PoolSize reaches PoolSizeMax.
|
|
size_t PoolSizeMax = 0;
|
|
/// Small allocation size allowed in the pool even if pool size is over the
|
|
/// pool size limit.
|
|
size_t SmallAllocMax = 1024;
|
|
/// Small allocation pool size.
|
|
size_t SmallPoolSize = 0;
|
|
/// Small allocation pool size max (4MB).
|
|
size_t SmallPoolSizeMax = (4 << 20);
|
|
/// List of buckets.
|
|
std::vector<std::vector<BlockTy *>> Buckets;
|
|
/// List of bucket parameters.
|
|
std::vector<std::pair<size_t, size_t>> BucketParams;
|
|
/// Map from allocated pointer to corresponding block.
|
|
llvm::DenseMap<void *, BlockTy *> PtrToBlock;
|
|
/// Simple stats counting miss/hit in each bucket.
|
|
std::vector<std::pair<uint64_t, uint64_t>> BucketStats;
|
|
/// Need to zero-initialize after L0 allocation.
|
|
bool ZeroInit = false;
|
|
|
|
/// Get bucket ID from the specified allocation size.
|
|
uint32_t getBucketId(size_t Size) {
|
|
uint32_t Count = 0;
|
|
for (size_t SZ = AllocMin; SZ < Size; Count++)
|
|
SZ <<= 1;
|
|
return Count;
|
|
}
|
|
|
|
public:
|
|
MemPoolTy() = default;
|
|
MemPoolTy(const MemPoolTy &) = delete;
|
|
MemPoolTy(MemPoolTy &&) = delete;
|
|
MemPoolTy &operator=(const MemPoolTy &) = delete;
|
|
MemPoolTy &operator=(const MemPoolTy &&) = delete;
|
|
~MemPoolTy() = default;
|
|
|
|
void printUsage();
|
|
|
|
/// Initialize pool with allocation kind, allocator, and user options.
|
|
Error init(int32_t Kind, MemAllocatorTy *Allocator,
|
|
const L0OptionsTy &Option);
|
|
// Initialize pool used for reduction pool.
|
|
Error init(MemAllocatorTy *Allocator, const L0OptionsTy &Option);
|
|
// Initialize pool used for small memory pool with fixed parameters.
|
|
Error init(MemAllocatorTy *Allocator);
|
|
|
|
/// Release resources used in the pool.
|
|
Error deinit();
|
|
|
|
/// Allocate the requested size of memory from this pool.
|
|
/// AllocSize is the chunk size internally used for the returned memory.
|
|
Expected<void *> alloc(size_t Size, size_t &AllocSize);
|
|
/// Deallocate the specified memory and returns block size deallocated.
|
|
size_t dealloc(void *Ptr);
|
|
}; // MemPoolTy
|
|
|
|
/// Allocation information maintained in the plugin.
|
|
class MemAllocInfoMapTy {
|
|
/// Map from allocated pointer to allocation information.
|
|
std::map<void *, MemAllocInfoTy> Map;
|
|
/// Map from target alloc kind to number of implicit arguments.
|
|
std::array<uint32_t, MaxMemKind> NumImplicitArgs;
|
|
|
|
public:
|
|
/// Add allocation information to the map.
|
|
void add(void *Ptr, void *Base, size_t ReqSize, size_t AllocSize,
|
|
int32_t Kind, bool InPool = false, bool ImplicitArg = false);
|
|
|
|
/// Remove allocation information for the given memory location.
|
|
bool remove(void *Ptr, MemAllocInfoTy *Removed = nullptr);
|
|
|
|
/// Finds allocation information for the given memory location.
|
|
const MemAllocInfoTy *find(void *Ptr) const {
|
|
auto AllocInfo = Map.find(Ptr);
|
|
if (AllocInfo == Map.end())
|
|
return nullptr;
|
|
else
|
|
return &AllocInfo->second;
|
|
}
|
|
|
|
/// Check if the map contains the given pointer and offset.
|
|
bool contains(const void *Ptr, size_t Size) const {
|
|
if (Map.size() == 0)
|
|
return false;
|
|
auto I = Map.upper_bound(const_cast<void *>(Ptr));
|
|
if (I == Map.begin())
|
|
return false;
|
|
--I;
|
|
|
|
uintptr_t PtrAsInt = reinterpret_cast<uintptr_t>(Ptr);
|
|
uintptr_t MapBase = reinterpret_cast<uintptr_t>(I->first);
|
|
uintptr_t MapSize = static_cast<uintptr_t>(I->second.ReqSize);
|
|
|
|
bool Ret = MapBase <= PtrAsInt && PtrAsInt + Size <= MapBase + MapSize;
|
|
return Ret;
|
|
}
|
|
|
|
/// Returns the number of implicit arguments for the specified allocation
|
|
/// kind.
|
|
size_t getNumImplicitArgs(int32_t Kind) {
|
|
assert(Kind >= 0 && Kind < MaxMemKind &&
|
|
"Invalid target allocation kind");
|
|
return NumImplicitArgs[Kind];
|
|
}
|
|
}; // MemAllocInfoMapTy
|
|
|
|
/// L0 context to use.
|
|
const L0ContextTy *L0Context = nullptr;
|
|
/// L0 device to use.
|
|
L0DeviceTy *Device = nullptr;
|
|
/// Whether the device supports large memory allocation.
|
|
bool SupportsLargeMem = false;
|
|
/// Cached max alloc size supported by device.
|
|
uint64_t MaxAllocSize;
|
|
/// Map from allocation kind to memory statistics.
|
|
std::array<MemStatTy, MaxMemKind> Stats;
|
|
/// Map from allocation kind to memory pool.
|
|
std::array<std::unique_ptr<MemPoolTy>, MaxMemKind> Pools;
|
|
|
|
/// Memory pool dedicated to reduction scratch space.
|
|
std::unique_ptr<MemPoolTy> ReductionPool;
|
|
/// Memory pool dedicated to reduction counters.
|
|
std::unique_ptr<MemPoolTy> CounterPool;
|
|
/// Allocation information map.
|
|
MemAllocInfoMapTy AllocInfo;
|
|
/// RTL-owned memory that needs to be freed automatically.
|
|
std::vector<void *> MemOwned;
|
|
/// Lock protection.
|
|
std::mutex Mtx;
|
|
/// Allocator only supports host memory.
|
|
bool IsHostMem = false;
|
|
// Internal deallocation function to be called when already
|
|
// hondling the Mtx lock.
|
|
Error deallocLocked(void *Ptr);
|
|
|
|
/// Allocate memory from L0 GPU RT.
|
|
Expected<void *> allocFromL0(size_t Size, size_t Align, int32_t Kind);
|
|
/// Deallocate memory from L0 GPU RT.
|
|
Error deallocFromL0(void *Ptr);
|
|
|
|
/// We use over-allocation workaround to support target pointer with
|
|
/// offset, and positive "ActiveSize" is specified in such cases to
|
|
/// correct debug logging.
|
|
Expected<void *> allocFromL0AndLog(size_t Size, size_t Align, int32_t Kind,
|
|
size_t ActiveSize = 0) {
|
|
auto MemOrErr = allocFromL0(Size, Align, Kind);
|
|
if (!MemOrErr)
|
|
return MemOrErr;
|
|
size_t LoggedSize = ActiveSize ? ActiveSize : Size;
|
|
log(LoggedSize, Size, Kind);
|
|
return MemOrErr;
|
|
}
|
|
|
|
/// Log memory allocation/deallocation.
|
|
void log(size_t ReqSize, size_t Size, int32_t Kind, bool Pool = false) {
|
|
if (Kind < 0 || Kind >= MaxMemKind)
|
|
return; // Stat is disabled.
|
|
|
|
auto &ST = Stats[Kind];
|
|
int32_t I = Pool ? 1 : 0;
|
|
if (ReqSize > 0) {
|
|
ST.Requested[I] += ReqSize;
|
|
ST.Allocated[I] += Size;
|
|
ST.InUse[I] += Size;
|
|
ST.NumAllocs[I]++;
|
|
} else {
|
|
ST.Freed[I] += Size;
|
|
ST.InUse[I] -= Size;
|
|
}
|
|
ST.PeakUse[I] = (std::max)(ST.PeakUse[I], ST.InUse[I]);
|
|
}
|
|
|
|
/// Perform copy operation.
|
|
Error enqueueMemCopy(void *Dst, const void *Src, size_t Size);
|
|
/// Perform memory fill operation.
|
|
Error enqueueMemSet(void *Dst, int8_t Value, size_t Size);
|
|
|
|
/// Allocate memory with the specified information from a memory pool.
|
|
Expected<void *> allocFromPool(size_t Size, size_t Align, int32_t Kind,
|
|
intptr_t Offset, bool UserAlloc,
|
|
bool DevMalloc, uint32_t MemAdvice,
|
|
AllocOptionTy AllocOpt);
|
|
/// Deallocate memory from memory pool.
|
|
Error deallocFromPool(void *Ptr) {
|
|
std::lock_guard<std::mutex> Lock(Mtx);
|
|
return deallocLocked(Ptr);
|
|
}
|
|
|
|
public:
|
|
MemAllocatorTy()
|
|
: MaxAllocSize(std::numeric_limits<decltype(MaxAllocSize)>::max()) {}
|
|
|
|
MemAllocatorTy(const MemAllocatorTy &) = delete;
|
|
MemAllocatorTy(MemAllocatorTy &&) = delete;
|
|
MemAllocatorTy &operator=(const MemAllocatorTy &) = delete;
|
|
MemAllocatorTy &operator=(const MemAllocatorTy &&) = delete;
|
|
~MemAllocatorTy() = default;
|
|
|
|
Error initDevicePools(L0DeviceTy &L0Device, const L0OptionsTy &Option);
|
|
Error initHostPool(L0ContextTy &Driver, const L0OptionsTy &Option);
|
|
void updateMaxAllocSize(L0DeviceTy &L0Device);
|
|
|
|
/// Release resources and report statistics if requested.
|
|
Error deinit();
|
|
|
|
/// Allocate memory with the specified information from a memory pool.
|
|
Expected<void *> alloc(size_t Size, size_t Align, int32_t Kind,
|
|
intptr_t Offset, bool UserAlloc, bool DevMalloc,
|
|
uint32_t MemAdvice, AllocOptionTy AllocOpt) {
|
|
return allocFromPool(Size, Align, Kind, Offset, UserAlloc, DevMalloc,
|
|
MemAdvice, AllocOpt);
|
|
}
|
|
|
|
/// Deallocate memory.
|
|
Error dealloc(void *Ptr) { return deallocFromPool(Ptr); }
|
|
|
|
/// Check if the given memory location and offset belongs to any allocated
|
|
/// memory.
|
|
bool contains(const void *Ptr, size_t Size) {
|
|
std::lock_guard<std::mutex> Lock(Mtx);
|
|
return AllocInfo.contains(Ptr, Size);
|
|
}
|
|
|
|
/// Get allocation information for the specified memory location.
|
|
const MemAllocInfoTy *getAllocInfo(void *Ptr) {
|
|
std::lock_guard<std::mutex> Lock(Mtx);
|
|
return AllocInfo.find(Ptr);
|
|
}
|
|
|
|
/// Get kernel indirect access flags using implicit argument info.
|
|
ze_kernel_indirect_access_flags_t getIndirectFlags() {
|
|
std::lock_guard<std::mutex> Lock(Mtx);
|
|
ze_kernel_indirect_access_flags_t Ret = 0;
|
|
if (AllocInfo.getNumImplicitArgs(TARGET_ALLOC_DEVICE) > 0)
|
|
Ret |= ZE_KERNEL_INDIRECT_ACCESS_FLAG_DEVICE;
|
|
if (AllocInfo.getNumImplicitArgs(TARGET_ALLOC_HOST) > 0)
|
|
Ret |= ZE_KERNEL_INDIRECT_ACCESS_FLAG_HOST;
|
|
if (AllocInfo.getNumImplicitArgs(TARGET_ALLOC_SHARED) > 0)
|
|
Ret |= ZE_KERNEL_INDIRECT_ACCESS_FLAG_SHARED;
|
|
return Ret;
|
|
}
|
|
}; /// MemAllocatorTy
|
|
|
|
// Simple generic wrapper to reuse objects.
// Objects must have a zero-argument accessible constructor.
template <class ObjTy> class ObjPool {
  // Protects the free list below.
  std::unique_ptr<std::mutex> Mtx;
  // List of objects available for reuse.
  std::list<ObjTy *> Objects;

public:
  ObjPool() { Mtx.reset(new std::mutex); }

  ObjPool(const ObjPool &) = delete;
  // Fixed: was "ObjPool(ObjPool &)" (a redundant non-const copy-ctor
  // delete); the move constructor is what needs to be suppressed.
  ObjPool(ObjPool &&) = delete;
  ObjPool &operator=(const ObjPool &) = delete;
  ObjPool &operator=(ObjPool &&) = delete;

  /// Return a pooled object if one is available, otherwise a newly
  /// default-constructed one. The caller owns the result until it is
  /// handed back via release().
  ObjTy *get() {
    {
      // The emptiness check must happen under the lock: the previous
      // unlocked pre-check raced with concurrent release() calls.
      std::lock_guard<std::mutex> Lock(*Mtx);
      if (!Objects.empty()) {
        ObjTy *Ret = Objects.back();
        Objects.pop_back();
        return Ret;
      }
    }
    return new ObjTy();
  }

  /// Hand an object back to the pool for later reuse.
  void release(ObjTy *Obj) {
    std::lock_guard<std::mutex> Lock(*Mtx);
    Objects.push_back(Obj);
  }

  /// Destroy all objects currently held by the pool. Objects still checked
  /// out via get() are not tracked and must be released or freed by their
  /// owners.
  ~ObjPool() {
    for (auto *Object : Objects)
      delete Object;
  }
};
|
|
/// Common event pool used in the plugin. This event pool assumes all events
/// from the pool are host-visible and use the same event pool flag.
class EventPoolTy {
  /// Size of L0 event pool created on demand.
  size_t PoolSize = 64;

  /// Context of the events.
  ze_context_handle_t Context = nullptr;

  /// Additional event pool flags common to this pool.
  uint32_t Flags = 0;

  /// Protection. Held by pointer -- presumably so the enclosing type stays
  /// movable despite std::mutex being immovable; confirm before changing.
  std::unique_ptr<std::mutex> Mtx;

  /// List of created L0 event pools.
  std::list<ze_event_pool_handle_t> Pools;

  /// List of free L0 events.
  std::list<ze_event_handle_t> Events;

#ifdef OMPT_SUPPORT
  /// Event to OMPT record map. The timestamp information is recorded to the
  /// OMPT record before the event is recycled.
  std::unordered_map<ze_event_handle_t, ompt_record_ompt_t *> EventToRecord;
#endif // OMPT_SUPPORT

public:
  /// Initialize context, flags, and mutex.
  Error init(ze_context_handle_t ContextIn, uint32_t FlagsIn) {
    Context = ContextIn;
    Flags = FlagsIn;
    Mtx.reset(new std::mutex);
    return Plugin::success();
  }

  /// Destroys L0 resources.
  /// NOTE(review): Events/Pools are not cleared afterwards, so a second
  /// deinit() would destroy the same handles again -- confirm the pool is
  /// never deinitialized twice.
  Error deinit() {
    for (auto E : Events)
      CALL_ZE_RET_ERROR(zeEventDestroy, E);
    for (auto P : Pools)
      CALL_ZE_RET_ERROR(zeEventPoolDestroy, P);
    return Plugin::success();
  }

  /// Get a free event from the pool.
  Expected<ze_event_handle_t> getEvent();

  /// Return an event to the pool.
  Error releaseEvent(ze_event_handle_t Event, L0DeviceTy &Device);
};
|
|
/// Staging buffer.
|
|
/// A single staging buffer is not enough when batching is enabled since there
|
|
/// can be multiple pending copy operations.
|
|
class StagingBufferTy {
|
|
/// Context for L0 calls.
|
|
ze_context_handle_t Context = nullptr;
|
|
/// Max allowed size for staging buffer.
|
|
size_t Size = L0StagingBufferSize;
|
|
/// Number of buffers allocated together.
|
|
size_t Count = L0StagingBufferCount;
|
|
/// Buffers increasing by Count if a new buffer is required.
|
|
llvm::SmallVector<void *> Buffers;
|
|
/// Next buffer location in the buffers.
|
|
size_t Offset = 0;
|
|
|
|
Expected<void *> addBuffers() {
|
|
ze_host_mem_alloc_desc_t AllocDesc{ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC,
|
|
nullptr, 0};
|
|
void *Ret = nullptr;
|
|
size_t AllocSize = Size * Count;
|
|
CALL_ZE_RET_ERROR(zeMemAllocHost, Context, &AllocDesc, AllocSize,
|
|
L0DefaultAlignment, &Ret);
|
|
Buffers.push_back(Ret);
|
|
return Ret;
|
|
}
|
|
|
|
public:
|
|
StagingBufferTy() = default;
|
|
StagingBufferTy(const StagingBufferTy &) = delete;
|
|
StagingBufferTy(StagingBufferTy &&) = delete;
|
|
StagingBufferTy &operator=(const StagingBufferTy &) = delete;
|
|
StagingBufferTy &operator=(const StagingBufferTy &&) = delete;
|
|
~StagingBufferTy() = default;
|
|
|
|
Error clear() {
|
|
for (auto Ptr : Buffers)
|
|
CALL_ZE_RET_ERROR(zeMemFree, Context, Ptr);
|
|
Context = nullptr;
|
|
return Plugin::success();
|
|
}
|
|
|
|
bool initialized() const { return Context != nullptr; }
|
|
|
|
void init(ze_context_handle_t ContextIn, size_t SizeIn, size_t CountIn) {
|
|
Context = ContextIn;
|
|
Size = SizeIn;
|
|
Count = CountIn;
|
|
}
|
|
|
|
void reset() { Offset = 0; }
|
|
|
|
/// Always return the first buffer.
|
|
Expected<void *> get() {
|
|
if (Size == 0 || Count == 0)
|
|
return nullptr;
|
|
return Buffers.empty() ? addBuffers() : Buffers.front();
|
|
}
|
|
|
|
/// Return the next available buffer.
|
|
Expected<void *> getNext() {
|
|
void *Ret = nullptr;
|
|
if (Size == 0 || Count == 0)
|
|
return Ret;
|
|
|
|
size_t AllocSize = Size * Count;
|
|
bool NeedToGrow = Buffers.empty() || Offset >= Buffers.size() * AllocSize;
|
|
if (NeedToGrow) {
|
|
auto PtrOrErr = addBuffers();
|
|
if (!PtrOrErr)
|
|
return PtrOrErr.takeError();
|
|
Ret = *PtrOrErr;
|
|
} else
|
|
Ret = reinterpret_cast<void *>(
|
|
reinterpret_cast<uintptr_t>(Buffers.back()) + (Offset % AllocSize));
|
|
|
|
if (!Ret)
|
|
return nullptr;
|
|
|
|
Offset += Size;
|
|
return Ret;
|
|
}
|
|
|
|
/// Return either a fixed buffer or next buffer.
|
|
Expected<void *> get(bool Next) { return Next ? getNext() : get(); }
|
|
};
|
|
|
|
} // namespace llvm::omp::target::plugin
|
|
|
|
#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0MEMORY_H
|