//===--- Level Zero Target RTL Implementation -----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Memory-related support for the SPIR-V/Xe machine.
//
//===----------------------------------------------------------------------===//
#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0MEMORY_H
#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0MEMORY_H

#include <array>
#include <cassert>
#include <cstdint>
#include <level_zero/ze_api.h>
#include <limits>
#include <list>
#include <map>
#include <memory>
#include <mutex>
#include <unordered_map>
#include <vector>

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"

#include "L0Defs.h"
#include "L0Trace.h"
namespace llvm::omp::target::plugin {

// Forward declarations.
struct L0OptionsTy;
class L0DeviceTy;
class L0ContextTy;
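/// Upper bound (exclusive) on TARGET_ALLOC kind values; used to size the
/// per-kind arrays below.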
constexpr static int32_t MaxMemKind = TARGET_ALLOC_LAST + 1;
struct DynamicMemHeapTy {
/// Base address memory is allocated from.
uintptr_t AllocBase = 0;
/// Minimal size served by the current heap.
size_t BlockSize = 0;
/// Max size served by the current heap.
size_t MaxSize = 0;
/// Available memory blocks.
uint32_t NumBlocks = 0;
/// Number of block descriptors.
uint32_t NumBlockDesc = 0;
/// Number of block counters.
uint32_t NumBlockCounter = 0;
/// List of memory block descriptors.
uint64_t *BlockDesc = nullptr;
/// List of memory block counters.
uint32_t *BlockCounter = nullptr;
};
struct DynamicMemPoolTy {
/// Location of device memory blocks.
void *PoolBase = nullptr;
/// Heap size common to all heaps.
size_t HeapSize = 0;
/// Number of heaps available.
uint32_t NumHeaps = 0;
/// Heap descriptors (using fixed-size array to simplify memory allocation).
DynamicMemHeapTy HeapDesc[8];
};
/// Memory allocation information used in memory allocation/deallocation.
struct MemAllocInfoTy {
/// Base address allocated from compute runtime.
void *Base = nullptr;
/// Allocation size known to users/libomptarget.
size_t ReqSize = 0;
/// Allocation size known to the plugin (can be larger than ReqSize).
size_t AllocSize = 0;
/// TARGET_ALLOC kind.
int32_t Kind = TARGET_ALLOC_DEFAULT;
/// Is the allocation from a pool?
bool InPool = false;
/// Is an implicit argument?
bool ImplicitArg = false;
MemAllocInfoTy() = default;
MemAllocInfoTy(void *Base, size_t ReqSize, size_t AllocSize, int32_t Kind,
bool InPool, bool ImplicitArg)
: Base(Base), ReqSize(ReqSize), AllocSize(AllocSize), Kind(Kind),
InPool(InPool), ImplicitArg(ImplicitArg) {}
};
/// Responsible for all activities involving memory allocation/deallocation.
/// It handles memory pool management and memory allocation bookkeeping.
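///
/// Illustrative flow (a sketch only; the device, option, and allocation
/// option objects are placeholders for values available at the call site):
///   MemAllocatorTy Allocator;
///   if (auto Err = Allocator.initDevicePools(L0Device, Options))
///     return Err;
///   auto PtrOrErr = Allocator.alloc(Size, Align, TARGET_ALLOC_DEVICE,
///                                   /*Offset=*/0, /*UserAlloc=*/false,
///                                   /*DevMalloc=*/false, /*MemAdvice=*/0,
///                                   AllocOpt);
///   if (!PtrOrErr)
///     return PtrOrErr.takeError();
///   ...
///   if (auto Err = Allocator.dealloc(*PtrOrErr))
///     return Err;
///   if (auto Err = Allocator.deinit())
///     return Err;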
class MemAllocatorTy {
/// Simple memory allocation statistics. Maintains separate counters for
/// direct GPU RT allocations (index 0) and pool allocations (index 1).
struct MemStatTy {
size_t Requested[2] = {0, 0}; // Requested bytes.
size_t Allocated[2] = {0, 0}; // Allocated bytes.
size_t Freed[2] = {0, 0}; // Freed bytes.
size_t InUse[2] = {0, 0}; // Current memory in use.
size_t PeakUse[2] = {0, 0}; // Peak bytes used.
size_t NumAllocs[2] = {0, 0}; // Number of allocations.
};
/// Memory pool which enables reuse of already allocated blocks:
/// -- Pool maintains a list of buckets each of which can allocate fixed-size
/// memory.
/// -- Each bucket maintains a list of memory blocks allocated by GPU RT.
/// -- Each memory block can allocate multiple fixed-size memory requested by
/// offload RT or user.
/// -- Memory allocation falls back to GPU RT allocation when the pool size
/// (total memory used by pool) reaches a threshold.
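///
/// Illustrative use within the allocator (a sketch only; the option object
/// is a placeholder for the parsed plugin options):
///   MemPoolTy Pool;
///   if (auto Err = Pool.init(TARGET_ALLOC_DEVICE, this, Options))
///     return Err;
///   size_t ChunkSize = 0;
///   auto PtrOrErr = Pool.alloc(/*Size=*/256, ChunkSize);
///   ...
///   size_t Freed = Pool.dealloc(*PtrOrErr);
///   if (auto Err = Pool.deinit())
///     return Err;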
class MemPoolTy {
/// Memory block maintained in each bucket.
struct BlockTy {
/// Base address of this block.
uintptr_t Base = 0;
/// Size of the block.
size_t Size = 0;
/// Supported allocation size by this block.
size_t ChunkSize = 0;
/// Total number of slots.
uint32_t NumSlots = 0;
/// Maximum representable slot count; also used as the invalid value for
/// FreeSlot.
static constexpr uint32_t MaxSlots =
std::numeric_limits<decltype(NumSlots)>::max();
/// Number of slots in use.
uint32_t NumUsedSlots = 0;
/// Cached available slot returned by the last dealloc() call.
uint32_t FreeSlot = MaxSlots;
/// Marker for the currently used slots.
std::vector<bool> UsedSlots;
BlockTy(void *BaseIn, size_t SizeIn, size_t ChunkSizeIn) {
Base = reinterpret_cast<uintptr_t>(BaseIn);
Size = SizeIn;
ChunkSize = ChunkSizeIn;
NumSlots = Size / ChunkSize;
NumUsedSlots = 0;
UsedSlots.resize(NumSlots, /*InitValue=*/false);
}
/// Check if the current block is fully used.
bool isFull() const { return NumUsedSlots == NumSlots; }
/// Check if the given address belongs to the current block.
bool contains(void *Mem) const {
auto M = reinterpret_cast<uintptr_t>(Mem);
return M >= Base && M < Base + Size;
}
/// Allocate a single chunk from the block.
void *alloc();
/// Deallocate the given memory.
void dealloc(void *Mem);
}; // BlockTy
/// Allocation kind for the current pool.
int32_t AllocKind = TARGET_ALLOC_DEFAULT;
/// Access to the allocator.
MemAllocatorTy *Allocator = nullptr;
/// Minimum supported memory allocation size from pool.
size_t AllocMin = 1 << 6; // 64B
/// Maximum supported memory allocation size from pool.
size_t AllocMax = 0;
/// Allocation size when the pool needs to allocate a block.
size_t AllocUnit = 1 << 16; // 64KB
/// Capacity of each block in the buckets, which decides the number of
/// allocatable chunks per block. Each block in a bucket can serve at least
/// BlockCapacity chunks:
///   If ChunkSize * BlockCapacity <= AllocUnit,
///     BlockSize = AllocUnit
///   Otherwise,
///     BlockSize = ChunkSize * BlockCapacity
/// In effect, this controls how much memory is over-allocated per block.
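/// For example (illustrative numbers), with AllocUnit = 64KB and
/// BlockCapacity = 4, a bucket with ChunkSize = 1KB uses 64KB blocks
/// (64 chunks each), while a bucket with ChunkSize = 32KB uses 128KB blocks
/// (4 chunks each).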
uint32_t BlockCapacity = 0;
/// Total memory allocated from GPU RT for this pool.
size_t PoolSize = 0;
/// Maximum allowed pool size. Allocation falls back to GPU RT allocation
/// when PoolSize reaches PoolSizeMax.
size_t PoolSizeMax = 0;
/// Small allocation size allowed in the pool even if pool size is over the
/// pool size limit.
size_t SmallAllocMax = 1024;
/// Small allocation pool size.
size_t SmallPoolSize = 0;
/// Small allocation pool size max (4MB).
size_t SmallPoolSizeMax = (4 << 20);
/// List of buckets.
std::vector<std::vector<BlockTy *>> Buckets;
/// List of bucket parameters.
std::vector<std::pair<size_t, size_t>> BucketParams;
/// Map from allocated pointer to corresponding block.
llvm::DenseMap<void *, BlockTy *> PtrToBlock;
/// Simple stats counting miss/hit in each bucket.
std::vector<std::pair<uint64_t, uint64_t>> BucketStats;
/// Need to zero-initialize after L0 allocation.
bool ZeroInit = false;
/// Get bucket ID from the specified allocation size.
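/// For example, with AllocMin = 64, sizes up to 64 bytes map to bucket 0,
/// 65..128 to bucket 1, 129..256 to bucket 2, and so on (one bucket per
/// power-of-two chunk size).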
uint32_t getBucketId(size_t Size) {
uint32_t Count = 0;
for (size_t SZ = AllocMin; SZ < Size; Count++)
SZ <<= 1;
return Count;
}
public:
MemPoolTy() = default;
MemPoolTy(const MemPoolTy &) = delete;
MemPoolTy(MemPoolTy &&) = delete;
MemPoolTy &operator=(const MemPoolTy &) = delete;
MemPoolTy &operator=(const MemPoolTy &&) = delete;
~MemPoolTy() = default;
void printUsage();
/// Initialize pool with allocation kind, allocator, and user options.
Error init(int32_t Kind, MemAllocatorTy *Allocator,
const L0OptionsTy &Option);
/// Initialize the pool used for reduction scratch space.
Error init(MemAllocatorTy *Allocator, const L0OptionsTy &Option);
/// Initialize the small-memory pool with fixed parameters.
Error init(MemAllocatorTy *Allocator);
/// Release resources used in the pool.
Error deinit();
/// Allocate the requested size of memory from this pool.
/// AllocSize is the chunk size internally used for the returned memory.
Expected<void *> alloc(size_t Size, size_t &AllocSize);
/// Deallocate the specified memory and return the deallocated size.
size_t dealloc(void *Ptr);
}; // MemPoolTy
/// Allocation information maintained in the plugin.
class MemAllocInfoMapTy {
/// Map from allocated pointer to allocation information.
std::map<void *, MemAllocInfoTy> Map;
/// Map from target alloc kind to number of implicit arguments.
std::array<uint32_t, MaxMemKind> NumImplicitArgs;
public:
/// Add allocation information to the map.
void add(void *Ptr, void *Base, size_t ReqSize, size_t AllocSize,
int32_t Kind, bool InPool = false, bool ImplicitArg = false);
/// Remove allocation information for the given memory location.
bool remove(void *Ptr, MemAllocInfoTy *Removed = nullptr);
/// Find allocation information for the given memory location.
const MemAllocInfoTy *find(void *Ptr) const {
auto AllocInfo = Map.find(Ptr);
if (AllocInfo == Map.end())
return nullptr;
else
return &AllocInfo->second;
}
/// Check if the given pointer and size fall within a recorded allocation.
bool contains(const void *Ptr, size_t Size) const {
if (Map.size() == 0)
return false;
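// Find the last entry whose base address is not greater than Ptr:
// upper_bound returns the first entry with a key strictly greater than Ptr,
// so step back one entry and check its range.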
auto I = Map.upper_bound(const_cast<void *>(Ptr));
if (I == Map.begin())
return false;
--I;
uintptr_t PtrAsInt = reinterpret_cast<uintptr_t>(Ptr);
uintptr_t MapBase = reinterpret_cast<uintptr_t>(I->first);
uintptr_t MapSize = static_cast<uintptr_t>(I->second.ReqSize);
bool Ret = MapBase <= PtrAsInt && PtrAsInt + Size <= MapBase + MapSize;
return Ret;
}
/// Returns the number of implicit arguments for the specified allocation
/// kind.
size_t getNumImplicitArgs(int32_t Kind) {
assert(Kind >= 0 && Kind < MaxMemKind &&
"Invalid target allocation kind");
return NumImplicitArgs[Kind];
}
}; // MemAllocInfoMapTy
/// L0 context to use.
const L0ContextTy *L0Context = nullptr;
/// L0 device to use.
L0DeviceTy *Device = nullptr;
/// Whether the device supports large memory allocation.
bool SupportsLargeMem = false;
/// Cached max alloc size supported by device.
uint64_t MaxAllocSize;
/// Map from allocation kind to memory statistics.
std::array<MemStatTy, MaxMemKind> Stats;
/// Map from allocation kind to memory pool.
std::array<std::unique_ptr<MemPoolTy>, MaxMemKind> Pools;
/// Memory pool dedicated to reduction scratch space.
std::unique_ptr<MemPoolTy> ReductionPool;
/// Memory pool dedicated to reduction counters.
std::unique_ptr<MemPoolTy> CounterPool;
/// Allocation information map.
MemAllocInfoMapTy AllocInfo;
/// RTL-owned memory that needs to be freed automatically.
std::vector<void *> MemOwned;
/// Mutex protecting the allocator state.
std::mutex Mtx;
/// Allocator only supports host memory.
bool IsHostMem = false;
/// Internal deallocation function to be called when already holding the
/// Mtx lock.
Error deallocLocked(void *Ptr);
/// Allocate memory from L0 GPU RT.
Expected<void *> allocFromL0(size_t Size, size_t Align, int32_t Kind);
/// Deallocate memory from L0 GPU RT.
Error deallocFromL0(void *Ptr);
/// Allocate from L0 and log the allocation. An over-allocation workaround is
/// used to support target pointers with an offset; a positive "ActiveSize"
/// is specified in such cases so that debug logging reports the correct size.
Expected<void *> allocFromL0AndLog(size_t Size, size_t Align, int32_t Kind,
size_t ActiveSize = 0) {
auto MemOrErr = allocFromL0(Size, Align, Kind);
if (!MemOrErr)
return MemOrErr;
size_t LoggedSize = ActiveSize ? ActiveSize : Size;
log(LoggedSize, Size, Kind);
return MemOrErr;
}
/// Log memory allocation/deallocation.
void log(size_t ReqSize, size_t Size, int32_t Kind, bool Pool = false) {
if (Kind < 0 || Kind >= MaxMemKind)
return; // Stat is disabled.
auto &ST = Stats[Kind];
int32_t I = Pool ? 1 : 0;
if (ReqSize > 0) {
ST.Requested[I] += ReqSize;
ST.Allocated[I] += Size;
ST.InUse[I] += Size;
ST.NumAllocs[I]++;
} else {
ST.Freed[I] += Size;
ST.InUse[I] -= Size;
}
ST.PeakUse[I] = (std::max)(ST.PeakUse[I], ST.InUse[I]);
}
/// Perform copy operation.
Error enqueueMemCopy(void *Dst, const void *Src, size_t Size);
/// Perform memory fill operation.
Error enqueueMemSet(void *Dst, int8_t Value, size_t Size);
/// Allocate memory with the specified information from a memory pool.
Expected<void *> allocFromPool(size_t Size, size_t Align, int32_t Kind,
intptr_t Offset, bool UserAlloc,
bool DevMalloc, uint32_t MemAdvice,
AllocOptionTy AllocOpt);
/// Deallocate memory from memory pool.
Error deallocFromPool(void *Ptr) {
std::lock_guard<std::mutex> Lock(Mtx);
return deallocLocked(Ptr);
}
public:
MemAllocatorTy()
: MaxAllocSize(std::numeric_limits<decltype(MaxAllocSize)>::max()) {}
MemAllocatorTy(const MemAllocatorTy &) = delete;
MemAllocatorTy(MemAllocatorTy &&) = delete;
MemAllocatorTy &operator=(const MemAllocatorTy &) = delete;
MemAllocatorTy &operator=(const MemAllocatorTy &&) = delete;
~MemAllocatorTy() = default;
Error initDevicePools(L0DeviceTy &L0Device, const L0OptionsTy &Option);
Error initHostPool(L0ContextTy &Driver, const L0OptionsTy &Option);
void updateMaxAllocSize(L0DeviceTy &L0Device);
/// Release resources and report statistics if requested.
Error deinit();
/// Allocate memory with the specified information from a memory pool.
Expected<void *> alloc(size_t Size, size_t Align, int32_t Kind,
intptr_t Offset, bool UserAlloc, bool DevMalloc,
uint32_t MemAdvice, AllocOptionTy AllocOpt) {
return allocFromPool(Size, Align, Kind, Offset, UserAlloc, DevMalloc,
MemAdvice, AllocOpt);
}
/// Deallocate memory.
Error dealloc(void *Ptr) { return deallocFromPool(Ptr); }
/// Check if the given memory location and size belong to any allocated
/// memory.
bool contains(const void *Ptr, size_t Size) {
std::lock_guard<std::mutex> Lock(Mtx);
return AllocInfo.contains(Ptr, Size);
}
/// Get allocation information for the specified memory location.
const MemAllocInfoTy *getAllocInfo(void *Ptr) {
std::lock_guard<std::mutex> Lock(Mtx);
return AllocInfo.find(Ptr);
}
/// Get kernel indirect access flags using implicit argument info.
ze_kernel_indirect_access_flags_t getIndirectFlags() {
std::lock_guard<std::mutex> Lock(Mtx);
ze_kernel_indirect_access_flags_t Ret = 0;
if (AllocInfo.getNumImplicitArgs(TARGET_ALLOC_DEVICE) > 0)
Ret |= ZE_KERNEL_INDIRECT_ACCESS_FLAG_DEVICE;
if (AllocInfo.getNumImplicitArgs(TARGET_ALLOC_HOST) > 0)
Ret |= ZE_KERNEL_INDIRECT_ACCESS_FLAG_HOST;
if (AllocInfo.getNumImplicitArgs(TARGET_ALLOC_SHARED) > 0)
Ret |= ZE_KERNEL_INDIRECT_ACCESS_FLAG_SHARED;
return Ret;
}
}; // MemAllocatorTy
/// Simple generic wrapper to reuse objects. Objects must have an accessible
/// zero-argument constructor.
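///
/// Illustrative usage (a sketch only; SomeObjTy is a placeholder type):
///   ObjPool<SomeObjTy> Pool;
///   SomeObjTy *Obj = Pool.get(); // Reuses a cached object when available.
///   ...
///   Pool.release(Obj);           // Return the object for later reuse.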
template <class ObjTy> class ObjPool {
// Protection.
std::unique_ptr<std::mutex> Mtx;
// List of Objects.
std::list<ObjTy *> Objects;
public:
ObjPool() { Mtx.reset(new std::mutex); }
ObjPool(const ObjPool &) = delete;
ObjPool(ObjPool &&) = delete;
ObjPool &operator=(const ObjPool &) = delete;
ObjPool &operator=(const ObjPool &&) = delete;
ObjTy *get() {
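// Opportunistically check for a cached object before taking the lock, then
// re-check under the lock before popping.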
if (!Objects.empty()) {
std::lock_guard<std::mutex> Lock(*Mtx);
if (!Objects.empty()) {
const auto Ret = Objects.back();
Objects.pop_back();
return Ret;
}
}
return new ObjTy();
}
void release(ObjTy *obj) {
std::lock_guard<std::mutex> Lock(*Mtx);
Objects.push_back(obj);
}
~ObjPool() {
for (auto Object : Objects)
delete Object;
}
};
/// Common event pool used in the plugin. This event pool assumes all events
/// from the pool are host-visible and use the same event pool flag.
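///
/// Illustrative usage (a sketch only; the context handle and device are
/// placeholders for values available at the call site):
///   EventPoolTy EventPool;
///   if (auto Err = EventPool.init(ZeContext, /*FlagsIn=*/0))
///     return Err;
///   auto EventOrErr = EventPool.getEvent();
///   if (!EventOrErr)
///     return EventOrErr.takeError();
///   ...
///   if (auto Err = EventPool.releaseEvent(*EventOrErr, Device))
///     return Err;
///   if (auto Err = EventPool.deinit())
///     return Err;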
class EventPoolTy {
/// Size of L0 event pool created on demand.
size_t PoolSize = 64;
/// Context of the events.
ze_context_handle_t Context = nullptr;
/// Additional event pool flags common to this pool.
uint32_t Flags = 0;
/// Protection.
std::unique_ptr<std::mutex> Mtx;
/// List of created L0 event pools.
std::list<ze_event_pool_handle_t> Pools;
/// List of free L0 events.
std::list<ze_event_handle_t> Events;
#ifdef OMPT_SUPPORT
/// Event to OMPT record map. The timestamp information is recorded to the
/// OMPT record before the event is recycled.
std::unordered_map<ze_event_handle_t, ompt_record_ompt_t *> EventToRecord;
#endif // OMPT_SUPPORT
public:
/// Initialize context, flags, and mutex.
Error init(ze_context_handle_t ContextIn, uint32_t FlagsIn) {
Context = ContextIn;
Flags = FlagsIn;
Mtx.reset(new std::mutex);
return Plugin::success();
}
/// Destroys L0 resources.
Error deinit() {
for (auto E : Events)
CALL_ZE_RET_ERROR(zeEventDestroy, E);
for (auto P : Pools)
CALL_ZE_RET_ERROR(zeEventPoolDestroy, P);
return Plugin::success();
}
/// Get a free event from the pool.
Expected<ze_event_handle_t> getEvent();
/// Return an event to the pool.
Error releaseEvent(ze_event_handle_t Event, L0DeviceTy &Device);
};
/// Staging buffer.
/// A single staging buffer is not enough when batching is enabled since there
/// can be multiple pending copy operations.
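///
/// Illustrative usage (a sketch only; the context handle is a placeholder):
///   StagingBufferTy Staging;
///   if (!Staging.initialized())
///     Staging.init(ZeContext, L0StagingBufferSize, L0StagingBufferCount);
///   auto BufferOrErr = Staging.get(/*Next=*/true);
///   if (!BufferOrErr)
///     return BufferOrErr.takeError();
///   ...
///   Staging.reset();  // Start reusing buffers for the next batch.
///   if (auto Err = Staging.clear())
///     return Err;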
class StagingBufferTy {
/// Context for L0 calls.
ze_context_handle_t Context = nullptr;
/// Max allowed size for staging buffer.
size_t Size = L0StagingBufferSize;
/// Number of buffers allocated together.
size_t Count = L0StagingBufferCount;
/// Allocated staging buffers; grown by Count buffers at a time when a new
/// buffer is required.
llvm::SmallVector<void *> Buffers;
/// Offset of the next available buffer within the allocated buffers.
size_t Offset = 0;
Expected<void *> addBuffers() {
ze_host_mem_alloc_desc_t AllocDesc{ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC,
nullptr, 0};
void *Ret = nullptr;
size_t AllocSize = Size * Count;
CALL_ZE_RET_ERROR(zeMemAllocHost, Context, &AllocDesc, AllocSize,
L0DefaultAlignment, &Ret);
Buffers.push_back(Ret);
return Ret;
}
public:
StagingBufferTy() = default;
StagingBufferTy(const StagingBufferTy &) = delete;
StagingBufferTy(StagingBufferTy &&) = delete;
StagingBufferTy &operator=(const StagingBufferTy &) = delete;
StagingBufferTy &operator=(const StagingBufferTy &&) = delete;
~StagingBufferTy() = default;
Error clear() {
for (auto Ptr : Buffers)
CALL_ZE_RET_ERROR(zeMemFree, Context, Ptr);
Context = nullptr;
return Plugin::success();
}
bool initialized() const { return Context != nullptr; }
void init(ze_context_handle_t ContextIn, size_t SizeIn, size_t CountIn) {
Context = ContextIn;
Size = SizeIn;
Count = CountIn;
}
void reset() { Offset = 0; }
/// Always return the first buffer.
Expected<void *> get() {
if (Size == 0 || Count == 0)
return nullptr;
return Buffers.empty() ? addBuffers() : Buffers.front();
}
/// Return the next available buffer.
Expected<void *> getNext() {
void *Ret = nullptr;
if (Size == 0 || Count == 0)
return Ret;
size_t AllocSize = Size * Count;
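// Grow when every buffer allocated so far has been handed out. Offset
// advances by Size per request, and each growth step adds AllocSize bytes.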
bool NeedToGrow = Buffers.empty() || Offset >= Buffers.size() * AllocSize;
if (NeedToGrow) {
auto PtrOrErr = addBuffers();
if (!PtrOrErr)
return PtrOrErr.takeError();
Ret = *PtrOrErr;
} else
Ret = reinterpret_cast<void *>(
reinterpret_cast<uintptr_t>(Buffers.back()) + (Offset % AllocSize));
if (!Ret)
return nullptr;
Offset += Size;
return Ret;
}
/// Return either a fixed buffer or next buffer.
Expected<void *> get(bool Next) { return Next ? getNext() : get(); }
};
} // namespace llvm::omp::target::plugin
#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0MEMORY_H