//===--- Level Zero Target RTL Implementation -----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Memory-related support for SPIR-V/Xe machines.
//
//===----------------------------------------------------------------------===//

#include "L0Memory.h"
#include "L0Device.h"
#include "L0Plugin.h"

namespace llvm::omp::target::plugin {

#if OMPTARGET_DEBUG
static const char *AllocKindToStr(int32_t Kind) {
  switch (Kind) {
  case TARGET_ALLOC_DEVICE:
    return "DEVICE";
  case TARGET_ALLOC_HOST:
    return "HOST";
  case TARGET_ALLOC_SHARED:
    return "SHARED";
  default:
    return "DEFAULT";
  }
}
#endif

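/// Allocate a chunk from this block. Returns nullptr if the block is full.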
void *MemAllocatorTy::MemPoolTy::BlockTy::alloc() {
  if (isFull())
    return nullptr;

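  // Fast path: reuse the most recently freed slot if one is cached.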
  if (FreeSlot != MaxSlots) {
    const uint32_t Slot = FreeSlot;
    FreeSlot = MaxSlots;
    UsedSlots[Slot] = true;
    NumUsedSlots++;
    return reinterpret_cast<void *>(Base + Slot * ChunkSize);
  }
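  // Slow path: scan for the first free slot.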
  for (uint32_t I = 0; I < NumSlots; I++) {
    if (UsedSlots[I])
      continue;
    UsedSlots[I] = true;
    NumUsedSlots++;
    return reinterpret_cast<void *>(Base + I * ChunkSize);
  }
  // Should not reach here.
  assert(false && "Inconsistent memory pool state");
  return nullptr;
}

/// Deallocate the given memory.
void MemAllocatorTy::MemPoolTy::BlockTy::dealloc(void *Mem) {
  assert(contains(Mem) && "Inconsistent memory pool state");
  const uint32_t Slot = (reinterpret_cast<uintptr_t>(Mem) - Base) / ChunkSize;
  UsedSlots[Slot] = false;
  NumUsedSlots--;
  FreeSlot = Slot;
}

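/// Initialize a pool for the given allocation kind using user-defined
/// options.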
Error MemAllocatorTy::MemPoolTy::init(int32_t Kind, MemAllocatorTy *AllocatorIn,
                                      const L0OptionsTy &Option) {
  AllocKind = Kind;
  Allocator = AllocatorIn;

  // Read user-defined options.
  const auto &UserOptions = Option.MemPoolConfig[AllocKind];
  const size_t UserAllocMax = UserOptions.AllocMax;
  const size_t UserCapacity = UserOptions.Capacity;
  const size_t UserPoolSize = UserOptions.PoolSize;

  BlockCapacity = UserCapacity;
  PoolSizeMax = UserPoolSize << 20; // Convert MB to B.
  PoolSize = 0;

  auto Context = Allocator->L0Context->getZeContext();
  const auto Device = Allocator->Device;

  // Check the page size used for this allocation kind to decide the minimum
  // allocation size when allocating from L0.
  auto MemOrErr = Allocator->allocFromL0(8, 0, AllocKind);
  if (!MemOrErr)
    return MemOrErr.takeError();
  void *Mem = *MemOrErr;
  ze_memory_allocation_properties_t AP{
      ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES, nullptr,
      ZE_MEMORY_TYPE_UNKNOWN, 0, 0};
  CALL_ZE_RET_ERROR(zeMemGetAllocProperties, Context, Mem, &AP, nullptr);
  AllocUnit = (std::max)(AP.pageSize, AllocUnit);
  if (auto Err = Allocator->deallocFromL0(Mem))
    return Err;

  bool IsDiscrete = false;
  if (Device) {
    ze_device_properties_t Properties{};
    Properties.deviceId = 0;
    Properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
    Properties.pNext = nullptr;
    CALL_ZE_RET_ERROR(zeDeviceGetProperties, Device->getZeDevice(),
                      &Properties);
    IsDiscrete = Device->isDiscreteDevice();

    if (AllocKind == TARGET_ALLOC_SHARED && IsDiscrete) {
      // Use the page size as the minimum chunk size for USM shared on a
      // discrete device.
      // FIXME: pageSize is not returned correctly (=0) on some new devices,
      // so use a fallback value for now.
      AllocMin = (std::max)(AP.pageSize, AllocUnit);
      AllocUnit = AllocMin * BlockCapacity;
    }
  }

  // Convert MB to B and round up to a power of 2.
  AllocMax = AllocMin << getBucketId(UserAllocMax * (1 << 20));
  if (AllocMin >= AllocMax) {
    AllocMax = 2 * AllocMin;
    DP("Warning: Adjusting pool's AllocMax to %zu for %s due to device "
       "requirements.\n",
       AllocMax, AllocKindToStr(AllocKind));
  }
  assert(AllocMin < AllocMax &&
         "Invalid parameters while initializing memory pool");
  const auto MinSize = getBucketId(AllocMin);
  const auto MaxSize = getBucketId(AllocMax);
  Buckets.resize(MaxSize - MinSize + 1);
  BucketStats.resize(Buckets.size(), {0, 0});

  // Set bucket parameters.
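  // Bucket I serves chunks of ChunkSize = AllocMin << I, and a block normally
  // groups BlockCapacity chunks. For example, with AllocMin = 64 and
  // BlockCapacity = 8, bucket 3 would use 512-byte chunks in 4KB blocks.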
  for (size_t I = 0; I < Buckets.size(); I++) {
    const size_t ChunkSize = AllocMin << I;
    size_t BlockSize = ChunkSize * BlockCapacity;
    // On a discrete device, the cost of a native L0 allocation doubles when
    // the requested size doubles beyond a certain threshold, so allocating a
    // larger block does not pay off. It is better to keep a single chunk per
    // block in such cases.
    if (BlockSize <= AllocUnit) {
      BlockSize = AllocUnit; // Allocation unit is already large enough.
    } else if (IsDiscrete) {
      // Do not preallocate if it does not pay off.
      if (ChunkSize >= L0UsmPreAllocThreshold ||
          (AllocKind == TARGET_ALLOC_HOST &&
           ChunkSize >= L0HostUsmPreAllocThreshold))
        BlockSize = ChunkSize;
    }
    BucketParams.emplace_back(ChunkSize, BlockSize);
  }

  DP("Initialized %s pool for device " DPxMOD ": AllocUnit = %zu, "
     "AllocMax = %zu, "
     "Capacity = %" PRIu32 ", PoolSizeMax = %zu\n",
     AllocKindToStr(AllocKind), DPxPTR(Device), AllocUnit, AllocMax,
     BlockCapacity, PoolSizeMax);
  return Plugin::success();
}

/// Initialize the reduction scratch pool.
Error MemAllocatorTy::MemPoolTy::init(MemAllocatorTy *AllocatorIn,
                                      const L0OptionsTy &Option) {
  AllocKind = TARGET_ALLOC_DEVICE;
  Allocator = AllocatorIn;
  AllocMin = AllocUnit = 1024 << 6; // 64KB.
  AllocMax = Option.ReductionPoolInfo[0] << 20;
  BlockCapacity = Option.ReductionPoolInfo[1];
  PoolSize = 0;
  PoolSizeMax = (size_t)Option.ReductionPoolInfo[2] << 20;

  const auto MinSize = getBucketId(AllocMin);
  const auto MaxSize = getBucketId(AllocMax);
  Buckets.resize(MaxSize - MinSize + 1);
  BucketStats.resize(Buckets.size(), {0, 0});
  for (size_t I = 0; I < Buckets.size(); I++) {
    const size_t ChunkSize = AllocMin << I;
    BucketParams.emplace_back(ChunkSize, ChunkSize * BlockCapacity);
  }

  DP("Initialized reduction scratch pool for device " DPxMOD
     ": AllocMin = %zu, AllocMax = %zu, PoolSizeMax = %zu\n",
     DPxPTR(Allocator->Device), AllocMin, AllocMax, PoolSizeMax);
  return Plugin::success();
}

/// Initialize a small zero-initialized pool with fixed parameters, used for
/// reduction counters.
Error MemAllocatorTy::MemPoolTy::init(MemAllocatorTy *AllocatorIn) {
  AllocKind = TARGET_ALLOC_DEVICE;
  Allocator = AllocatorIn;
  AllocMax = AllocMin;
  BlockCapacity = AllocUnit / AllocMax;
  PoolSize = 0;
  PoolSizeMax = (1 << 20); // This should be sufficiently large.
  Buckets.resize(1);
  BucketStats.resize(1, {0, 0});
  BucketParams.emplace_back(AllocMax, AllocUnit);
  ZeroInit = true;
  DP("Initialized zero-initialized reduction counter pool for "
     "device " DPxMOD ": AllocMin = %zu, AllocMax = %zu, PoolSizeMax = %zu\n",
     DPxPTR(Allocator->Device), AllocMin, AllocMax, PoolSizeMax);
  return Plugin::success();
}

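/// Report per-bucket usage statistics for this pool (debug output).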
void MemAllocatorTy::MemPoolTy::printUsage() {
  ODBG_OS([&](llvm::raw_ostream &Os) {
    auto PrintNum = [&](uint64_t Num) {
      if (Num > 1e9)
        Os << llvm::format("%.2e", float(Num));
      else
        Os << llvm::format("%11" PRIu64, Num);
    };

    bool HasPoolAlloc = false;
    for (auto &Stat : BucketStats) {
      if (Stat.first > 0 || Stat.second > 0) {
        HasPoolAlloc = true;
        break;
      }
    }

    Os << "MemPool usage for " << AllocKindToStr(AllocKind) << ", device "
       << Allocator->Device << "\n";

    if (HasPoolAlloc) {
      Os << "-- AllocMax=" << (AllocMax >> 20)
         << "(MB), Capacity=" << BlockCapacity
         << ", PoolSizeMax=" << (PoolSizeMax >> 20) << "(MB)\n";
      Os << "-- "
         << llvm::format("%18s:%11s%11s%11s\n", "", "NewAlloc", "Reuse",
                         "Hit(%)");
      for (size_t I = 0; I < Buckets.size(); I++) {
        const auto &Stat = BucketStats[I];
        if (Stat.first > 0 || Stat.second > 0) {
          Os << "-- Bucket[" << llvm::format("%10zu", BucketParams[I].first)
             << "]:";
          PrintNum(Stat.first);
          PrintNum(Stat.second);
          Os << llvm::format("%11.2f\n", float(Stat.second) /
                                             float(Stat.first + Stat.second) *
                                             100);
        }
      }
    } else {
      Os << "-- Not used\n";
    }
  });
}

/// Release resources used in the pool.
Error MemAllocatorTy::MemPoolTy::deinit() {
  printUsage();
  for (auto &Bucket : Buckets) {
    for (auto *Block : Bucket) {
      ODBG_IF([&]() { Allocator->log(0, Block->Size, AllocKind); });
      auto Err =
          Allocator->deallocFromL0(reinterpret_cast<void *>(Block->Base));
      delete Block;
      if (Err)
        return Err;
    }
  }
  return Plugin::success();
}

/// Allocate the requested size of memory from this pool.
/// AllocSize is the chunk size internally used for the returned memory.
Expected<void *> MemAllocatorTy::MemPoolTy::alloc(size_t Size,
                                                  size_t &AllocSize) {
  if (Size == 0 || Size > AllocMax)
    return nullptr;

  const uint32_t BucketId = getBucketId(Size);
  auto &Blocks = Buckets[BucketId];
  void *Mem = nullptr;

  for (auto *Block : Blocks) {
    if (Block->isFull())
      continue;
    Mem = Block->alloc();
    assert(Mem && "Inconsistent state while allocating memory from pool");
    PtrToBlock.try_emplace(Mem, Block);
    break;
  }

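  // If no reusable chunk was found, allocate a new block. Small allocations
  // may still be served from the pool after it reaches PoolSizeMax, up to
  // SmallPoolSizeMax in total.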
  if (Mem == nullptr) {
    const bool IsSmallAllocatable =
        (Size <= SmallAllocMax && SmallPoolSize <= SmallPoolSizeMax);
    const bool IsFull = (PoolSize > PoolSizeMax);
    if (IsFull && !IsSmallAllocatable)
      return nullptr;
    // Bucket is empty or all blocks in the bucket are full.
    const auto ChunkSize = BucketParams[BucketId].first;
    const auto BlockSize = BucketParams[BucketId].second;
    auto BaseOrErr = Allocator->allocFromL0AndLog(BlockSize, 0, AllocKind);
    if (!BaseOrErr)
      return BaseOrErr.takeError();

    void *Base = *BaseOrErr;
    if (ZeroInit) {
      auto Err = Allocator->enqueueMemSet(Base, 0, BlockSize);
      if (Err)
        return Err;
    }

    BlockTy *Block = new BlockTy(Base, BlockSize, ChunkSize);
    Blocks.push_back(Block);
    Mem = Block->alloc();
    PtrToBlock.try_emplace(Mem, Block);
    if (IsFull)
      SmallPoolSize += BlockSize;
    else
      PoolSize += BlockSize;
    DP("New block allocation for %s pool: base = " DPxMOD
       ", size = %zu, pool size = %zu\n",
       AllocKindToStr(AllocKind), DPxPTR(Base), BlockSize, PoolSize);
    BucketStats[BucketId].first++;
  } else {
    BucketStats[BucketId].second++;
  }

  AllocSize = (AllocMin << BucketId);

  return Mem;
}

/// Deallocate the specified memory and return the size of the deallocated
/// chunk.
size_t MemAllocatorTy::MemPoolTy::dealloc(void *Ptr) {
  const auto I = PtrToBlock.find(Ptr);
  if (I == PtrToBlock.end())
    return 0;
  BlockTy *Block = I->second;
  Block->dealloc(Ptr);
  PtrToBlock.erase(I);
  return Block->ChunkSize;
}

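/// Record allocation information for the given memory location, verifying
/// that the tracked ranges remain disjoint.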
void MemAllocatorTy::MemAllocInfoMapTy::add(void *Ptr, void *Base,
                                            size_t ReqSize, size_t AllocSize,
                                            int32_t Kind, bool InPool,
                                            bool ImplicitArg) {
  const auto Inserted = Map.emplace(
      Ptr, MemAllocInfoTy{Base, ReqSize, AllocSize, Kind, InPool, ImplicitArg});
  // Check if we keep valid disjoint memory ranges.
  [[maybe_unused]] bool Valid = Inserted.second;
  if (Valid) {
    if (Inserted.first != Map.begin()) {
      const auto I = std::prev(Inserted.first, 1);
      Valid =
          Valid && (uintptr_t)I->first + I->second.ReqSize <= (uintptr_t)Ptr;
    }
    if (Valid) {
      const auto I = std::next(Inserted.first, 1);
      if (I != Map.end())
        Valid = Valid && (uintptr_t)Ptr + ReqSize <= (uintptr_t)I->first;
    }
  }
  assert(Valid && "Invalid overlapping memory allocation");
  assert(Kind >= 0 && Kind < MaxMemKind && "Invalid target allocation kind");
  if (ImplicitArg)
    NumImplicitArgs[Kind]++;
}

/// Remove allocation information for the given memory location.
bool MemAllocatorTy::MemAllocInfoMapTy::remove(void *Ptr,
                                               MemAllocInfoTy *Removed) {
  const auto AllocInfo = Map.find(Ptr);
  if (AllocInfo == Map.end())
    return false;
  if (AllocInfo->second.ImplicitArg)
    NumImplicitArgs[AllocInfo->second.Kind]--;
  if (Removed)
    *Removed = AllocInfo->second;
  Map.erase(AllocInfo);
  return true;
}

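/// Initialize the device and shared memory pools, the reduction scratch
/// pool, and the reduction counter pool for the given device.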
Error MemAllocatorTy::initDevicePools(L0DeviceTy &L0Device,
                                      const L0OptionsTy &Options) {
  SupportsLargeMem = L0Device.supportsLargeMem();
  IsHostMem = false;
  Device = &L0Device;
  L0Context = &L0Device.getL0Context();
  for (auto Kind : {TARGET_ALLOC_DEVICE, TARGET_ALLOC_SHARED}) {
    if (Options.MemPoolConfig[Kind].Use) {
      std::lock_guard<std::mutex> Lock(Mtx);
      Pools[Kind] = std::make_unique<MemPoolTy>();
      if (auto Err = Pools[Kind]->init(Kind, this, Options))
        return Err;
    }
  }
  ReductionPool = std::make_unique<MemPoolTy>();
  if (auto Err = ReductionPool->init(this, Options))
    return Err;
  CounterPool = std::make_unique<MemPoolTy>();
  if (auto Err = CounterPool->init(this))
    return Err;
  updateMaxAllocSize(L0Device);
  return Plugin::success();
}

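/// Initialize the host memory pool for the given driver context.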
Error MemAllocatorTy::initHostPool(L0ContextTy &Driver,
                                   const L0OptionsTy &Option) {
  SupportsLargeMem = Driver.supportsLargeMem();
  IsHostMem = true;
  L0Context = &Driver;
  if (Option.MemPoolConfig[TARGET_ALLOC_HOST].Use) {
    std::lock_guard<std::mutex> Lock(Mtx);
    Pools[TARGET_ALLOC_HOST] = std::make_unique<MemPoolTy>();
    if (auto Err =
            Pools[TARGET_ALLOC_HOST]->init(TARGET_ALLOC_HOST, this, Option))
      return Err;
  }
  return Plugin::success();
}

void MemAllocatorTy::updateMaxAllocSize(L0DeviceTy &L0Device) {
  // Update the maximum allocation size for this allocator.
  auto DeviceMaxAlloc = L0Device.getMaxMemAllocSize();

  if (IsHostMem) {
    // MaxAllocSize should be the minimum across all devices of the driver.
    if (MaxAllocSize > DeviceMaxAlloc) {
      MaxAllocSize = DeviceMaxAlloc;
      DP("Updated MaxAllocSize for driver " DPxMOD " to %zu\n",
         DPxPTR(L0Context), MaxAllocSize);
    }
    return;
  }

  MaxAllocSize = DeviceMaxAlloc;
  DP("Updated MaxAllocSize for device " DPxMOD " to %zu\n", DPxPTR(Device),
     MaxAllocSize);
}

/// Release resources and report statistics if requested.
Error MemAllocatorTy::deinit() {
  if (!L0Context)
    return Plugin::success();

  std::lock_guard<std::mutex> Lock(Mtx);
  if (!L0Context)
    return Plugin::success();
  // Release RTL-owned memory.
  for (auto *M : MemOwned) {
    auto Err = deallocLocked(M);
    if (Err)
      return Err;
  }
  for (auto &Pool : Pools) {
    if (Pool) {
      if (auto Err = Pool->deinit())
        return Err;
      Pool.reset(nullptr);
    }
  }
  if (ReductionPool) {
    if (auto Err = ReductionPool->deinit())
      return Err;
    ReductionPool.reset(nullptr);
  }
  if (CounterPool) {
    if (auto Err = CounterPool->deinit())
      return Err;
    CounterPool.reset(nullptr);
  }
  // Report memory usage if requested.
  ODBG_OS([&](llvm::raw_ostream &Os) {
    for (size_t Kind = 0; Kind < MaxMemKind; Kind++) {
      auto &Stat = Stats[Kind];
      Os << "Memory usage for " << AllocKindToStr(Kind) << ", device " << Device
         << "\n";
      if (Stat.NumAllocs[0] == 0 && Stat.NumAllocs[1] == 0) {
        Os << "-- Not used\n";
        continue;
      }
      Os << "-- Allocator: " << llvm::format("%12s", "Native") << ", "
         << llvm::format("%12s", "Pool") << "\n";
      Os << "-- Requested: " << llvm::format("%12zu", Stat.Requested[0]) << ", "
         << llvm::format("%12zu", Stat.Requested[1]) << "\n";
      Os << "-- Allocated: " << llvm::format("%12zu", Stat.Allocated[0]) << ", "
         << llvm::format("%12zu", Stat.Allocated[1]) << "\n";
      Os << "-- Freed    : " << llvm::format("%12zu", Stat.Freed[0]) << ", "
         << llvm::format("%12zu", Stat.Freed[1]) << "\n";
      Os << "-- InUse    : " << llvm::format("%12zu", Stat.InUse[0]) << ", "
         << llvm::format("%12zu", Stat.InUse[1]) << "\n";
      Os << "-- PeakUse  : " << llvm::format("%12zu", Stat.PeakUse[0]) << ", "
         << llvm::format("%12zu", Stat.PeakUse[1]) << "\n";
      Os << "-- NumAllocs: " << llvm::format("%12zu", Stat.NumAllocs[0]) << ", "
         << llvm::format("%12zu", Stat.NumAllocs[1]) << "\n";
    }
  });

  // Mark as deinitialized.
  L0Context = nullptr;
  return Plugin::success();
}

/// Allocate memory with the specified information.
Expected<void *> MemAllocatorTy::allocFromPool(size_t Size, size_t Align,
                                               int32_t Kind, intptr_t Offset,
                                               bool UserAlloc, bool DevMalloc,
                                               uint32_t MemAdvice,
                                               AllocOptionTy AllocOpt) {
  assert((Kind == TARGET_ALLOC_DEVICE || Kind == TARGET_ALLOC_HOST ||
          Kind == TARGET_ALLOC_SHARED) &&
         "Unknown memory kind while allocating target memory");

  std::lock_guard<std::mutex> Lock(Mtx);

  // We do not expect a meaningful Align parameter when Offset > 0, so the
  // following code does not handle that case.

  size_t AllocSize = Size + Offset;
  void *Mem = nullptr;
  void *AllocBase = nullptr;
  const bool UseScratchPool =
      (AllocOpt == AllocOptionTy::ALLOC_OPT_REDUCTION_SCRATCH);
  const bool UseZeroInitPool =
      (AllocOpt == AllocOptionTy::ALLOC_OPT_REDUCTION_COUNTER);
  const bool UseDedicatedPool = UseScratchPool || UseZeroInitPool;

  if ((Pools[Kind] &&
       MemAdvice == std::numeric_limits<decltype(MemAdvice)>::max()) ||
      UseDedicatedPool) {
    // The pool is enabled for this allocation kind, and no memory advice is
    // requested. We avoid the pool when meaningful memory advice is given so
    // that it does not affect sibling allocations in the same block.
    if (Align > 0)
      AllocSize += (Align - 1);
    size_t PoolAllocSize = 0;
    MemPoolTy *Pool = nullptr;

    if (UseScratchPool)
      Pool = ReductionPool.get();
    else if (UseZeroInitPool)
      Pool = CounterPool.get();
    else
      Pool = Pools[Kind].get();

    auto PtrOrErr = Pool->alloc(AllocSize, PoolAllocSize);
    if (!PtrOrErr)
      return PtrOrErr.takeError();
    AllocBase = *PtrOrErr;
    if (AllocBase) {
      uintptr_t Base = (uintptr_t)AllocBase;
      if (Align > 0)
        Base = (Base + Align - 1) & ~(Align - 1); // Round up to Align.
      Mem = (void *)(Base + Offset);
      AllocInfo.add(Mem, AllocBase, Size, PoolAllocSize, Kind, true, UserAlloc);
      log(Size, PoolAllocSize, Kind, true /* Pool */);
      if (DevMalloc)
        MemOwned.push_back(AllocBase);
      if (UseDedicatedPool) {
        DP("Allocated %zu bytes from %s pool\n", Size,
           UseScratchPool ? "scratch" : "zero-initialized");
      }
      return Mem;
    }
  }

  auto AllocBaseOrErr =
      allocFromL0AndLog(AllocSize, Align, Kind, /*ActiveSize=*/Size);
  if (!AllocBaseOrErr)
    return AllocBaseOrErr.takeError();
  AllocBase = *AllocBaseOrErr;
  if (AllocBase) {
    Mem = (void *)((uintptr_t)AllocBase + Offset);
    AllocInfo.add(Mem, AllocBase, Size, AllocSize, Kind, false, UserAlloc);
    if (DevMalloc)
      MemOwned.push_back(AllocBase);
    if (UseDedicatedPool) {
      // We do not want this to happen in general.
      DP("Allocated %zu bytes from L0 for %s pool\n", Size,
         UseScratchPool ? "scratch" : "zero-initialized");
    }
  }
  return Mem;
}

/// Deallocate memory.
Error MemAllocatorTy::deallocLocked(void *Ptr) {
  MemAllocInfoTy Info;
  if (!AllocInfo.remove(Ptr, &Info)) {
    return Plugin::error(ErrorCode::BACKEND_FAILURE,
                         "Cannot find memory allocation information for " DPxMOD
                         "\n",
                         DPxPTR(Ptr));
  }
  if (Info.InPool) {
    size_t DeallocSize = 0;
    if (Pools[Info.Kind])
      DeallocSize = Pools[Info.Kind]->dealloc(Info.Base);
    if (DeallocSize == 0) {
      // Try reduction scratch pool.
      DeallocSize = ReductionPool->dealloc(Info.Base);
      // Try reduction counter pool.
      if (DeallocSize == 0)
        DeallocSize = CounterPool->dealloc(Info.Base);
      if (DeallocSize == 0) {
        return Plugin::error(ErrorCode::BACKEND_FAILURE,
                             "Cannot return memory " DPxMOD " to pool\n",
                             DPxPTR(Ptr));
      }
    }
    log(0, DeallocSize, Info.Kind, true /* Pool */);
    return Plugin::success();
  }
  if (!Info.Base) {
    return Plugin::error(ErrorCode::INVALID_ARGUMENT,
                         "Cannot find base address of " DPxMOD "\n",
                         DPxPTR(Ptr));
  }
  log(/*NoReqSize*/ 0, Info.AllocSize, Info.Kind);

  if (auto Err = deallocFromL0(Info.Base))
    return Err;
  DP("Deleted device memory " DPxMOD " (Base: " DPxMOD ", Size: %zu)\n",
     DPxPTR(Ptr), DPxPTR(Info.Base), Info.AllocSize);

  return Plugin::success();
}

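/// Enqueue a fill of Size bytes at Dst with the single-byte pattern Value.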
Error MemAllocatorTy::enqueueMemSet(void *Dst, int8_t Value, size_t Size) {
  return Device->enqueueMemFill(Dst, &Value, sizeof(int8_t), Size);
}

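/// Enqueue a copy of Size bytes from Src to Dst.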
Error MemAllocatorTy::enqueueMemCopy(void *Dst, const void *Src, size_t Size) {
  return Device->enqueueMemCopy(Dst, Src, Size);
}

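/// Allocate memory directly from L0 with the requested size, alignment, and
/// allocation kind.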
Expected<void *> MemAllocatorTy::allocFromL0(size_t Size, size_t Align,
                                             int32_t Kind) {
  void *Mem = nullptr;
  ze_device_mem_alloc_desc_t DeviceDesc{ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC,
                                        nullptr, 0, 0};
  ze_host_mem_alloc_desc_t HostDesc{ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC,
                                    nullptr, 0};

  // Use the relaxed allocation limit if the driver supports it.
  ze_relaxed_allocation_limits_exp_desc_t RelaxedDesc{
      ZE_STRUCTURE_TYPE_RELAXED_ALLOCATION_LIMITS_EXP_DESC, nullptr,
      ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE};
  if (Size > MaxAllocSize && SupportsLargeMem) {
    DeviceDesc.pNext = &RelaxedDesc;
    HostDesc.pNext = &RelaxedDesc;
  }

  auto ZeDevice = Device ? Device->getZeDevice() : nullptr;
  auto ZeContext = L0Context->getZeContext();
  bool MakeResident = false;
  switch (Kind) {
  case TARGET_ALLOC_DEVICE:
    MakeResident = true;
    CALL_ZE_RET_ERROR(zeMemAllocDevice, ZeContext, &DeviceDesc, Size, Align,
                      ZeDevice, &Mem);
    DP("Allocated %zu bytes of device memory " DPxMOD "\n", Size,
       DPxPTR(Mem));
    break;
  case TARGET_ALLOC_HOST:
    CALL_ZE_RET_ERROR(zeMemAllocHost, ZeContext, &HostDesc, Size, Align, &Mem);
    DP("Allocated %zu bytes of host memory " DPxMOD "\n", Size, DPxPTR(Mem));
    break;
  case TARGET_ALLOC_SHARED:
    CALL_ZE_RET_ERROR(zeMemAllocShared, ZeContext, &DeviceDesc, &HostDesc, Size,
                      Align, ZeDevice, &Mem);
    DP("Allocated %zu bytes of shared memory " DPxMOD "\n", Size,
       DPxPTR(Mem));
    break;
  default:
    assert(0 && "Invalid target data allocation kind");
  }

  if (MakeResident) {
    assert(Device &&
           "Device is not set for memory allocation. Is this a Device Pool?");
    if (auto Err = Device->makeMemoryResident(Mem, Size)) {
      Mem = nullptr;
      return std::move(Err);
    }
  }
  return Mem;
}

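/// Free the given memory back to L0.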
Error MemAllocatorTy::deallocFromL0(void *Ptr) {
  CALL_ZE_RET_ERROR(zeMemFree, L0Context->getZeContext(), Ptr);
  DP("Freed device pointer " DPxMOD "\n", DPxPTR(Ptr));
  return Plugin::success();
}

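/// Return an event from the pool, creating a new batch of PoolSize events if
/// none are available.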
Expected<ze_event_handle_t> EventPoolTy::getEvent() {
  std::lock_guard<std::mutex> Lock(*Mtx);

  if (Events.empty()) {
    // Need to create a new L0 pool.
    ze_event_pool_desc_t Desc{ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, nullptr, 0, 0};
    Desc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE | Flags;
    Desc.count = PoolSize;
    ze_event_pool_handle_t Pool;
    CALL_ZE_RET_ERROR(zeEventPoolCreate, Context, &Desc, 0, nullptr, &Pool);
    Pools.push_back(Pool);

    // Create events.
    ze_event_desc_t EventDesc{ZE_STRUCTURE_TYPE_EVENT_DESC, nullptr, 0, 0, 0};
    EventDesc.wait = 0;
    EventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
    for (uint32_t I = 0; I < PoolSize; I++) {
      EventDesc.index = I;
      ze_event_handle_t Event;
      CALL_ZE_RET_ERROR(zeEventCreate, Pool, &EventDesc, &Event);
      Events.push_back(Event);
    }
  }

  auto Ret = Events.back();
  Events.pop_back();

  return Ret;
}

/// Return an event to the pool.
Error EventPoolTy::releaseEvent(ze_event_handle_t Event, L0DeviceTy &Device) {
  std::lock_guard<std::mutex> Lock(*Mtx);
  CALL_ZE_RET_ERROR(zeEventHostReset, Event);
  Events.push_back(Event);
  return Plugin::success();
}

} // namespace llvm::omp::target::plugin