blob: 1edafcd04be57598737e34196323ba4634147f61 [file] [edit]
//===--- Level Zero Target RTL Implementation -----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Level Zero Queue abstraction.
//
//===----------------------------------------------------------------------===//
#include "L0Queue.h"
#include "L0Device.h"
#include "L0Kernel.h"
#include "L0Plugin.h"
#include "llvm/ADT/ScopeExit.h"
namespace llvm::omp::target::plugin {
/// common methods
/// Acquires a command list manager from the device and then runs the
/// queue-specific initialization (initImpl()).
///
/// If initImpl() fails, the command list manager acquired here is handed back
/// to the device and CmdList is cleared, so a caller that destroys the queue
/// without calling deinit() does not leave the command list checked out.
///
/// \returns success, the error from acquiring the command list manager, or the
/// error from initImpl() joined with any error from the rollback release.
Error L0QueueTy::init() {
  auto CmdListOrErr = Device.getCmdListManager(CreateQueueInOrder);
  if (!CmdListOrErr)
    return CmdListOrErr.takeError();
  CmdList = *CmdListOrErr;

  if (auto Err = initImpl()) {
    // Roll back the acquisition above; report both failures if the release
    // fails as well.
    if (auto ReleaseErr = Device.releaseCmdListManager(CmdList))
      Err = joinErrors(std::move(Err), std::move(ReleaseErr));
    CmdList = nullptr;
    return Err;
  }
  return Plugin::success();
}
/// Tears the queue down: runs the queue-specific deinitialization, resets the
/// queue state, and hands the command list manager back to the device.
///
/// Every step is attempted even if an earlier one failed, so a failure in
/// deinitImpl() no longer prevents the command list from being released.
/// CmdList is cleared once a release has been attempted.
///
/// \returns success, or all errors encountered, joined together.
Error L0QueueTy::deinit() {
  Error AllErrors = deinitImpl();
  reset();
  if (CmdList) {
    if (auto Err = Device.releaseCmdListManager(CmdList))
      AllErrors = joinErrors(std::move(AllErrors), std::move(Err));
    // Do not keep a handle that has already been offered back to the device.
    CmdList = nullptr;
  }
  return AllErrors;
}
/// Appends a launch of \p Kernel to this queue's command list.
///
/// When KEnv.IsPtrArg is set, the launch is appended together with the group
/// sizes and KEnv.ArgPtrs; otherwise only the group counts are passed.
/// KEnv.Lock is unlocked when this function returns, on both the success and
/// the error path.
///
/// \param Kernel        Kernel handle to launch.
/// \param KEnv          Launch environment (group counts/sizes, arguments,
///                      cooperative flag, lock).
/// \param SignalEvent   Event passed through as the launch's signal event.
/// \param NumWaitEvents Number of entries in \p WaitEvents.
/// \param WaitEvents    Events passed through as the launch's wait list.
/// \returns the result of the append call.
Error L0QueueTy::dispatchLaunchKernel(ze_kernel_handle_t Kernel,
                                      L0LaunchEnvTy &KEnv,
                                      ze_event_handle_t SignalEvent,
                                      uint32_t NumWaitEvents,
                                      ze_event_handle_t *WaitEvents) {
  // Fires on every exit path, after the append call below has returned.
  llvm::scope_exit ReleaseEnvLock([&KEnv]() { KEnv.Lock.unlock(); });

  if (!KEnv.IsPtrArg)
    return CmdList->appendLaunchKernel(Kernel, &KEnv.GroupCounts, SignalEvent,
                                       NumWaitEvents, WaitEvents,
                                       KEnv.IsCooperative);

  return CmdList->appendLaunchKernelWithArgs(
      Kernel, &KEnv.GroupCounts, &KEnv.GroupSizes, KEnv.ArgPtrs, SignalEvent,
      NumWaitEvents, WaitEvents, KEnv.IsCooperative);
}
// L0AsyncQueueTy implementation.
/// Releases every event still recorded in WaitEvents back to the device and
/// empties the list. A failed release does not stop the loop.
///
/// \returns success, or all release errors joined together.
Error L0AsyncQueueTy::deinitImpl() {
  Error Result = Plugin::success();
  for (auto &Pending : WaitEvents)
    if (auto ReleaseErr = Device.releaseEvent(Pending))
      Result = joinErrors(std::move(Result), std::move(ReleaseErr));

  WaitEvents.clear();
  return Result;
}
/// Drops all bookkeeping state of the queue: the deferred copy lists, the
/// recorded events and the kernel event.
///
/// Events are only forgotten here, not released to the device, and deferred
/// copies are discarded without being performed.
void L0AsyncQueueTy::resetImpl() {
  // Deferred host-side copies.
  USM2MList.clear();
  H2MList.clear();
  // Event tracking.
  KernelEvent = nullptr;
  WaitEvents.clear();
}
/// Performs all deferred host-side copies and empties both pending lists.
/// USM2MList is drained first, then H2MList. Each entry is copied byte-wise
/// from its source to its destination.
void L0AsyncQueueTy::processCopyQueues() {
  const auto DrainPending = [](auto &Pending) {
    for (auto &[From, To, NumBytes] : Pending) {
      const char *FromBytes = static_cast<const char *>(From);
      char *ToBytes = static_cast<char *>(To);
      std::copy_n(FromBytes, NumBytes, ToBytes);
    }
    Pending.clear();
  };

  DrainPending(USM2MList);
  DrainPending(H2MList);
}
/// Waits for the outstanding work of this queue, releases all recorded events
/// and then performs the deferred host-side copies.
///
/// Events are visited from newest to oldest to avoid resetting an event
/// prematurely. Host synchronization is issued for each event until the
/// kernel event has been waited on; since the kernel's wait events are
/// signaled before the kernel is invoked, older events are only released.
/// The kernel event is always synchronized to support printf().
///
/// \returns success, or all synchronization/release errors joined together.
Error L0AsyncQueueTy::synchronizeImpl() {
  Error Result = Plugin::success();
  bool ReachedKernelEvent = false;

  for (auto It = WaitEvents.rbegin(), End = WaitEvents.rend(); It != End;
       ++It) {
    auto &Event = *It;
    if (!ReachedKernelEvent) {
      Result = joinErrors(std::move(Result),
                          CmdList->eventHostSynchronize(Event));
      // Once the kernel event is done, everything older is done as well.
      ReachedKernelEvent = (Event == KernelEvent);
    }
    if (auto ReleaseErr = Device.releaseEvent(Event))
      Result = joinErrors(std::move(Result), std::move(ReleaseErr));
  }

  WaitEvents.clear();
  KernelEvent = nullptr;
  processCopyQueues();
  return Result;
}
/// Reports whether the queue still has recorded events.
///
/// When no events are recorded, the deferred host-side copies are performed
/// before reporting that nothing is pending.
///
/// \returns true if WaitEvents is non-empty, false otherwise.
Expected<bool> L0AsyncQueueTy::hasPendingWorkImpl() {
  if (WaitEvents.empty()) {
    processCopyQueues();
    return false;
  }
  return true;
}
std::tuple<size_t, ze_event_handle_t *> L0AsyncQueueTy::getMemCopyEvents() {
return KernelEvent ? std::make_tuple(1, &KernelEvent)
: std::make_tuple(0, nullptr);
}
std::tuple<size_t, ze_event_handle_t *>
L0AsyncQueueTy::getLaunchKernelEvents() {
return WaitEvents.empty()
? std::make_tuple(0, nullptr)
: std::make_tuple(WaitEvents.size(), WaitEvents.data());
}
/// Appends a copy of \p Size bytes from \p Src to \p Dst to the command list.
///
/// A fresh event is obtained from the device as the signal event, and the
/// command waits on the events returned by getMemCopyEvents(). On success the
/// signal event is recorded in WaitEvents; on failure it is released again.
///
/// \returns success, or the append error joined with any release error.
Error L0AsyncQueueTy::memoryCopyImpl(void *Dst, const void *Src, size_t Size) {
  auto EventOrErr = Device.getEvent();
  if (!EventOrErr)
    return EventOrErr.takeError();
  ze_event_handle_t DoneEvent = *EventOrErr;

  auto [NumDeps, Deps] = getMemCopyEvents();
  if (auto AppendErr =
          CmdList->appendMemoryCopy(Dst, Src, Size, DoneEvent, NumDeps, Deps)) {
    // The command was not queued, so nothing will ever signal this event.
    if (auto ReleaseErr = Device.releaseEvent(DoneEvent))
      return joinErrors(std::move(AppendErr), std::move(ReleaseErr));
    return AppendErr;
  }

  WaitEvents.push_back(DoneEvent);
  return Plugin::success();
}
/// Retrieves \p Size bytes from \p TgtPtr into the host buffer \p HstPtr.
///
/// - If \p TgtPtr is host or shared USM, the data is copied on the host:
///   immediately when no kernel event is recorded, otherwise the copy is
///   deferred via USM2MList because it must wait for kernel completion.
/// - Otherwise a copy command is queued. On a discrete device, when the size
///   fits the configured staging buffer size and \p HstPtr is not host USM,
///   the command targets a staging buffer and the final staging-to-host copy
///   is deferred via H2MList.
///
/// \returns success, or the error from obtaining the staging buffer or from
/// queuing the copy.
Error L0AsyncQueueTy::dataRetrieveImpl(void *HstPtr, const void *TgtPtr,
                                       int64_t Size) {
  const auto SrcKind = Device.getMemAllocType(TgtPtr);
  const bool IsHostVisible =
      SrcKind == ZE_MEMORY_TYPE_HOST || SrcKind == ZE_MEMORY_TYPE_SHARED;

  if (IsHostVisible) {
    if (KernelEvent) {
      // Delay Host/Shared USM to host memory copy since it must wait for
      // kernel completion.
      USM2MList.emplace_back(
          PendingCopyDescTy{TgtPtr, HstPtr, static_cast<size_t>(Size)});
    } else {
      std::copy_n(static_cast<const char *>(TgtPtr), Size,
                  static_cast<char *>(HstPtr));
    }
    return Plugin::success();
  }

  // Destination of the queued copy: the host buffer itself or a staging
  // buffer.
  void *CopyTarget = HstPtr;
  if (Device.isDiscreteDevice() &&
      static_cast<size_t>(Size) <=
          Device.getPlugin().getOptions().StagingBufferSize &&
      Device.getMemAllocType(HstPtr) != ZE_MEMORY_TYPE_HOST) {
    auto StagingOrErr = Device.getStagingBuffer().get(/*IsAsync*/ true);
    if (!StagingOrErr)
      return StagingOrErr.takeError();
    CopyTarget = *StagingOrErr;
  }

  if (auto CopyErr = memoryCopy(CopyTarget, TgtPtr, Size))
    return CopyErr;

  // A staging buffer was used: remember the final hop into the host buffer.
  if (CopyTarget != HstPtr)
    H2MList.emplace_back(
        PendingCopyDescTy{CopyTarget, HstPtr, static_cast<size_t>(Size)});
  return Plugin::success();
}
/// Submits \p Size bytes from the host buffer \p HstPtr to \p TgtPtr.
///
/// - If \p TgtPtr is host or shared USM, the data is copied on the host right
///   away.
/// - Otherwise a copy command is queued. On a discrete device, when the size
///   fits the configured staging buffer size and \p HstPtr is not host USM,
///   the data is first copied into a staging buffer which then serves as the
///   source of the queued copy.
///
/// \returns success, or the error from obtaining the staging buffer or from
/// queuing the copy.
Error L0AsyncQueueTy::dataSubmitImpl(void *TgtPtr, const void *HstPtr,
                                     int64_t Size) {
  const auto DstKind = Device.getMemAllocType(TgtPtr);
  const bool IsHostVisible =
      DstKind == ZE_MEMORY_TYPE_SHARED || DstKind == ZE_MEMORY_TYPE_HOST;
  if (IsHostVisible) {
    std::copy_n(static_cast<const char *>(HstPtr), Size,
                static_cast<char *>(TgtPtr));
    return Plugin::success();
  }

  // Source of the queued copy: the host buffer itself or a staging buffer.
  const void *CopySource = HstPtr;
  if (Device.isDiscreteDevice() &&
      static_cast<size_t>(Size) <=
          Device.getPlugin().getOptions().StagingBufferSize &&
      Device.getMemAllocType(HstPtr) != ZE_MEMORY_TYPE_HOST) {
    auto StagingOrErr = Device.getStagingBuffer().get(/*IsAsync*/ true);
    if (!StagingOrErr)
      return StagingOrErr.takeError();
    CopySource = *StagingOrErr;
    // Fill the staging buffer on the host before queuing the copy.
    std::copy_n(static_cast<const char *>(HstPtr), Size,
                static_cast<char *>(const_cast<void *>(CopySource)));
  }

  return memoryCopy(TgtPtr, CopySource, Size);
}
/// Appends a barrier to the command list, with no signal event and an empty
/// wait list.
///
/// \returns the result of appending the barrier.
Error L0AsyncQueueTy::dataFenceImpl() {
  Error FenceErr =
      CmdList->appendBarrier(/*SignalEvent*/ nullptr, /*NumWaitEvents*/ 0,
                             /*WaitEvents*/ nullptr);
  return FenceErr;
}
/// Launches \p Kernel on this queue.
///
/// A fresh event is obtained from the device as the signal event, and the
/// launch waits on the events returned by getLaunchKernelEvents(). On success
/// the signal event is recorded in WaitEvents and becomes the kernel event;
/// on failure it is released again.
///
/// \returns success, or the launch error joined with any release error.
Error L0AsyncQueueTy::launchKernelImpl(ze_kernel_handle_t Kernel,
                                       L0LaunchEnvTy &KEnv) {
  // NOTE(review): dispatchLaunchKernel() unlocks KEnv.Lock on all of its exit
  // paths, but the early return below happens before it is called, so the
  // lock is left untouched on that path. Confirm the caller handles this.
  auto EventOrError = Device.getEvent();
  if (!EventOrError)
    return EventOrError.takeError();
  ze_event_handle_t DoneEvent = *EventOrError;

  auto [NumDeps, Deps] = getLaunchKernelEvents();
  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device.getDeviceId(),
       "Kernel depends on %zu data copying events.\n", NumDeps);

  Error LaunchErr =
      dispatchLaunchKernel(Kernel, KEnv, DoneEvent, NumDeps, Deps);
  if (!LaunchErr) {
    WaitEvents.push_back(DoneEvent);
    KernelEvent = DoneEvent;
    return Plugin::success();
  }

  // The launch was not queued, so nothing will ever signal this event.
  if (auto ReleaseErr = Device.releaseEvent(DoneEvent))
    LaunchErr = joinErrors(std::move(LaunchErr), std::move(ReleaseErr));
  return LaunchErr;
}
/// Appends a fill of \p Size bytes at \p Ptr with the \p PatternSize-byte
/// \p Pattern to the command list.
///
/// A fresh event is obtained from the device as the signal event, and the
/// command waits on the events returned by getMemCopyEvents(). On success the
/// signal event is recorded in WaitEvents; on failure it is released again.
///
/// \returns success, or the append error joined with any release error.
Error L0AsyncQueueTy::memoryFillImpl(void *Ptr, const void *Pattern,
                                     size_t PatternSize, size_t Size) {
  auto EventOrErr = Device.getEvent();
  if (!EventOrErr)
    return EventOrErr.takeError();

  auto [NumDeps, Deps] = getMemCopyEvents();
  ze_event_handle_t DoneEvent = *EventOrErr;

  Error FillErr = CmdList->appendMemoryFill(Ptr, Pattern, PatternSize, Size,
                                            DoneEvent, NumDeps, Deps);
  if (!FillErr) {
    WaitEvents.push_back(DoneEvent);
    return Plugin::success();
  }

  // The command was not queued, so nothing will ever signal this event.
  if (auto ReleaseErr = Device.releaseEvent(DoneEvent))
    FillErr = joinErrors(std::move(FillErr), std::move(ReleaseErr));
  return FillErr;
}
// L0AsyncOrderedQueue implementation.
/// Waits for the outstanding work of this queue, releases all recorded events
/// and performs the deferred host-side copies.
///
/// Only the most recently recorded event is waited on. In addition, if a
/// kernel event is recorded, differs from that last event, and no error has
/// occurred so far, it is synchronized as well to support printf().
///
/// \returns success, or all synchronization/release errors joined together.
Error L0AsyncOrderedQueueTy::synchronizeImpl() {
  Error Result = Plugin::success();

  // Only need to wait for the last event.
  ze_event_handle_t Tail = nullptr;
  if (!WaitEvents.empty())
    Tail = WaitEvents.back();
  if (Tail)
    Result =
        joinErrors(std::move(Result), CmdList->eventHostSynchronize(Tail));

  // Synchronize on kernel event to support printf().
  ze_event_handle_t Kernel = KernelEvent;
  if (Kernel && Kernel != Tail) {
    if (!Result)
      Result =
          joinErrors(std::move(Result), CmdList->eventHostSynchronize(Kernel));
  }

  for (auto &Pending : WaitEvents)
    if (auto ReleaseErr = Device.releaseEvent(Pending))
      Result = joinErrors(std::move(Result), std::move(ReleaseErr));

  processCopyQueues();
  WaitEvents.clear();
  KernelEvent = nullptr;
  return Result;
}
std::tuple<size_t, ze_event_handle_t *>
L0AsyncOrderedQueueTy::getMemCopyEvents() {
return WaitEvents.empty() ? std::make_tuple(0, nullptr)
: std::make_tuple(1, &WaitEvents.back());
}
std::tuple<size_t, ze_event_handle_t *>
L0AsyncOrderedQueueTy::getLaunchKernelEvents() {
return WaitEvents.empty() ? std::make_tuple(0, nullptr)
: std::make_tuple(1, &WaitEvents.back());
}
// L0InorderQueueTy implementation.
Error L0InorderQueueTy::synchronizeImpl() { return CmdList->hostSynchronize(); }
/// Asks the command list whether it still has pending work.
///
/// \returns the command list's answer, or the error it reported.
Expected<bool> L0InorderQueueTy::hasPendingWorkImpl() {
  // The command list is the only source of truth for this queue type.
  return CmdList->queryPendingWork();
}
/// Appends a copy of \p Size bytes from \p Src to \p Dst to the command list,
/// without passing any signal or wait events.
///
/// \returns the result of appending the copy.
Error L0InorderQueueTy::memoryCopyImpl(void *Dst, const void *Src,
                                       size_t Size) {
  Error CopyErr = CmdList->appendMemoryCopy(Dst, Src, Size);
  return CopyErr;
}
/// Launches \p Kernel through dispatchLaunchKernel(), without passing any
/// signal or wait events.
///
/// \returns the result of the dispatch.
Error L0InorderQueueTy::launchKernelImpl(ze_kernel_handle_t Kernel,
                                         L0LaunchEnvTy &KEnv) {
  Error LaunchErr = dispatchLaunchKernel(Kernel, KEnv);
  return LaunchErr;
}
// L0SyncQueueTy implementation.
/// Appends the copy via the in-order implementation and then synchronizes the
/// host with the command list, so the copy is complete on successful return.
///
/// \returns the append error if appending failed, otherwise the result of the
/// host synchronization.
Error L0SyncQueueTy::memoryCopyImpl(void *Dst, const void *Src, size_t Size) {
  Error AppendErr = L0InorderQueueTy::memoryCopyImpl(Dst, Src, Size);
  if (AppendErr)
    return AppendErr;

  return CmdList->hostSynchronize();
}
/// Launches the kernel via the in-order implementation and then synchronizes
/// the host with the command list.
///
/// \returns the launch error if launching failed, otherwise the result of the
/// host synchronization.
Error L0SyncQueueTy::launchKernelImpl(ze_kernel_handle_t Kernel,
                                      L0LaunchEnvTy &KEnv) {
  Error LaunchErr = L0InorderQueueTy::launchKernelImpl(Kernel, KEnv);
  if (LaunchErr)
    return LaunchErr;

  return CmdList->hostSynchronize();
}
// L0QueueCache implementation.
/// Hands out a queue: a cached one if available, otherwise a newly created
/// and initialized queue matching CachedCmdMode.
///
/// The cache is only locked while a cached queue is taken; creation and
/// initialization of a new queue happen outside the lock. A queue whose
/// init() fails is deleted.
///
/// \returns the queue, an UNIMPLEMENTED error for an unsupported command
/// mode, or the error from init().
Expected<L0QueueTy *> L0QueueCacheTy::getQueue() {
  // Fast path: reuse a cached queue.
  {
    std::lock_guard<std::mutex> Guard(Mtx);
    if (!Queues.empty()) {
      L0QueueTy *Cached = Queues.back();
      Queues.pop_back();
      return Cached;
    }
  }

  // Slow path: build a queue of the configured kind.
  L0QueueTy *Fresh = nullptr;
  switch (CachedCmdMode) {
  case CommandModeTy::Async:
    Fresh = new L0AsyncQueueTy(Device);
    break;
  case CommandModeTy::AsyncOrdered:
    Fresh = new L0AsyncOrderedQueueTy(Device);
    break;
  case CommandModeTy::Sync:
    Fresh = new L0SyncQueueTy(Device);
    break;
  case CommandModeTy::InOrder:
    Fresh = new L0InorderQueueTy(Device);
    break;
  default:
    return Plugin::error(ErrorCode::UNIMPLEMENTED, "Unsupported command mode");
  }

  if (auto InitErr = Fresh->init()) {
    delete Fresh;
    return std::move(InitErr);
  }
  return Fresh;
}
/// Returns \p Queue to the cache after resetting it. A null \p Queue is
/// ignored. The reset runs before the cache lock is taken.
void L0QueueCacheTy::releaseQueue(L0QueueTy *Queue) {
  if (Queue) {
    Queue->reset();
    std::lock_guard<std::mutex> Guard(Mtx);
    Queues.push_back(Queue);
  }
}
/// Deinitializes and deletes every cached queue, then empties the cache. The
/// cache lock is held for the whole operation, and a queue is deleted even if
/// its deinit() fails.
///
/// \returns success, or all deinit() errors joined together.
Error L0QueueCacheTy::deinit() {
  Error Result = Error::success();
  std::lock_guard<std::mutex> Guard(Mtx);

  for (auto *Cached : Queues) {
    auto DeinitErr = Cached->deinit();
    if (DeinitErr)
      Result = joinErrors(std::move(Result), std::move(DeinitErr));
    delete Cached;
  }

  Queues.clear();
  return Result;
}
} // namespace llvm::omp::target::plugin