blob: dc884e3b753dcde0076eef60c418f59299af5fd1 [file]
//===--- Level Zero Target RTL Implementation -----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// GenericKernel implementation for SPIR-V/Xe machine.
//
//===----------------------------------------------------------------------===//
#include "L0Kernel.h"
#include "L0Device.h"
#include "L0Plugin.h"
#include "L0Program.h"
#include "llvm/ADT/ScopeExit.h"
namespace llvm::omp::target::plugin {
/// Query Level Zero for the properties of the kernel handle and cache them in
/// this kernel's property record.
///
/// Caches the SIMD width, the preferred width, the number of kernel arguments
/// and the maximum thread-group size. When the context provides the
/// zexKernelGetArgumentSize entry point and the kernel has arguments, the size
/// of every argument is cached as well.
///
/// \param Program program the kernel belongs to; only its device is used here.
/// \returns success, or the error reported for a failing Level Zero call.
Error L0KernelTy::readKernelProperties(L0ProgramTy &Program) {
  const auto &Device = L0DeviceTy::makeL0Device(Program.getDevice());
  auto &KernelPR = getProperties();

  // Optional extension structure carrying the preferred group-size multiple.
  ze_kernel_preferred_group_size_properties_t PreferredGroupSize = {};
  PreferredGroupSize.stype =
      ZE_STRUCTURE_TYPE_KERNEL_PREFERRED_GROUP_SIZE_PROPERTIES;
  PreferredGroupSize.pNext = nullptr;

  // Chain the extension structure only for drivers reporting API >= 1.2.
  ze_kernel_properties_t KP = {};
  KP.stype = ZE_STRUCTURE_TYPE_KERNEL_PROPERTIES;
  KP.pNext = Device.getDriverAPIVersion() >= ZE_API_VERSION_1_2
                 ? &PreferredGroupSize
                 : nullptr;
  CALL_ZE_RET_ERROR(zeKernelGetProperties, zeKernel, &KP);

  KernelPR.SIMDWidth = KP.maxSubgroupSize;
  KernelPR.NumKernelArgs = KP.numKernelArgs;
  // Use the preferred multiple when it was queried, otherwise the maximum
  // subgroup size.
  KernelPR.Width =
      KP.pNext ? PreferredGroupSize.preferredMultiple : KP.maxSubgroupSize;

  // Devices that are not of the Gen architecture get a width of at least
  // twice the SIMD width.
  if (!Device.isDeviceArch(DeviceArchTy::DeviceArch_Gen)) {
    const auto MinWidth = 2 * KernelPR.SIMDWidth;
    if (KernelPR.Width < MinWidth)
      KernelPR.Width = MinWidth;
  }

  KernelPR.MaxThreadGroupSize = KP.maxSubgroupSize * KP.maxNumSubgroups;

  // Argument sizes can only be cached when there are arguments and the
  // extension entry point is available.
  auto &Context = Device.getL0Context();
  const bool CanQueryArgSizes =
      KernelPR.NumKernelArgs > 0 && Context.zexKernelGetArgumentSize;
  if (!CanQueryArgSizes)
    return Plugin::success();

  KernelPR.ArgSizes = std::make_unique<uint32_t[]>(KernelPR.NumKernelArgs);
  for (uint32_t I = 0; I < KernelPR.NumKernelArgs; ++I) {
    CALL_ZE_RET_ERROR(Context.zexKernelGetArgumentSize, zeKernel, I,
                      &KernelPR.ArgSizes[I]);
  }
  return Plugin::success();
}
/// Create the Level Zero kernel handle for this kernel and read its
/// properties.
///
/// The module is looked up in \p Program by kernel name; a missing module is
/// reported as NOT_FOUND.
///
/// \param Program program searched for the module containing this kernel.
/// \returns success, or the error from the lookup, the kernel creation or the
///          property query.
Error L0KernelTy::buildKernel(L0ProgramTy &Program) {
  const auto *Name = getName();

  auto Module = Program.findModuleFromKernelName(Name);
  if (!Module)
    return Plugin::error(ErrorCode::NOT_FOUND,
                         "kernel '%s' not found in the program", Name);

  // Describe the kernel: no extension chain, no flags, looked up by name.
  ze_kernel_desc_t KernelDesc = {};
  KernelDesc.stype = ZE_STRUCTURE_TYPE_KERNEL_DESC;
  KernelDesc.pNext = nullptr;
  KernelDesc.flags = 0;
  KernelDesc.pKernelName = Name;
  CALL_ZE_RET_ERROR(zeKernelCreate, Module, &KernelDesc, &zeKernel);

  // The outcome of the property query is the outcome of the build.
  return readKernelProperties(Program);
}
/// Initialize the kernel from \p Image: build the kernel handle and, on
/// success, register this kernel with the owning program.
///
/// \param GenericDevice not used by this implementation.
/// \param Image device image; converted to the L0 program that holds the kernel.
/// \returns success, or the error produced while building the kernel.
Error L0KernelTy::initImpl(GenericDeviceTy &GenericDevice,
                           DeviceImageTy &Image) {
  auto &Program = L0ProgramTy::makeL0Program(Image);

  Error BuildErr = buildKernel(Program);
  if (BuildErr)
    return BuildErr;

  // Only a successfully built kernel is added to the program.
  Program.addKernel(this);
  return Plugin::success();
}
/// Submit \p zeKernel through the device's immediate command list.
///
/// KEnv.Lock is unlocked by this function right after the launch command has
/// been appended. It is not touched on the early returns taken when the
/// command list or the event cannot be obtained.
///
/// In asynchronous mode the signal event is recorded in the async queue; in
/// synchronous mode the host waits on the event and then releases it.
///
/// \param l0Device device providing the command list and events.
/// \param zeKernel kernel handle to launch.
/// \param KEnv launch environment (group counts, async state, lock).
/// \param CommandMode selects how wait events are passed for async launches.
/// \returns success, or the accumulated errors of submission, synchronization
///          and event release.
static Error launchKernelWithImmCmdList(L0DeviceTy &l0Device,
                                        ze_kernel_handle_t zeKernel,
                                        L0LaunchEnvTy &KEnv,
                                        CommandModeTy CommandMode) {
  const auto DeviceId = l0Device.getDeviceId();
  auto *ZeIdStr = l0Device.getZeIdCStr();

  auto ImmCmdListOrErr = l0Device.getImmCmdList();
  if (!ImmCmdListOrErr)
    return ImmCmdListOrErr.takeError();
  const ze_command_list_handle_t CmdList = *ImmCmdListOrErr;
  // Command queue is not used with immediate command list.
  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
       "Using immediate command list for kernel submission.\n");

  // Event signalled by the kernel launch.
  auto SignalEventOrErr = l0Device.getEvent();
  if (!SignalEventOrErr)
    return SignalEventOrErr.takeError();
  ze_event_handle_t Event = *SignalEventOrErr;

  // Select the events the launch has to wait for. Only asynchronous launches
  // with recorded events wait on anything.
  auto *Queue = KEnv.AsyncQueue;
  size_t NumWaitEvents = 0;
  ze_event_handle_t *WaitEvents = nullptr;
  const bool HasPendingEvents = KEnv.IsAsync && !Queue->WaitEvents.empty();
  if (HasPendingEvents) {
    if (CommandMode != CommandModeTy::AsyncOrdered) {
      // Wait on every recorded event.
      NumWaitEvents = Queue->WaitEvents.size();
      WaitEvents = Queue->WaitEvents.data();
    } else {
      // Ordered mode: only the most recently recorded event is passed.
      NumWaitEvents = 1;
      WaitEvents = &Queue->WaitEvents.back();
    }
  }
  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
       "Kernel depends on %zu data copying events.\n", NumWaitEvents);

  Error AllErrors = Error::success();
  if (!KEnv.IsCooperative) {
    CALL_ZE_ACCUM_ERROR(AllErrors, zeCommandListAppendLaunchKernel, CmdList,
                        zeKernel, &KEnv.GroupCounts, Event, NumWaitEvents,
                        WaitEvents);
  } else {
    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Launching cooperative kernel " DPxMOD "\n", DPxPTR(zeKernel));
    CALL_ZE_ACCUM_ERROR(AllErrors, zeCommandListAppendLaunchCooperativeKernel,
                        CmdList, zeKernel, &KEnv.GroupCounts, Event,
                        NumWaitEvents, WaitEvents);
  }
  // The launch command has been appended; release the launch lock.
  KEnv.Lock.unlock();

  // Submission failed: give the event back and report everything collected.
  if (AllErrors) {
    if (auto ReleaseErr = l0Device.releaseEvent(Event))
      AllErrors = joinErrors(std::move(AllErrors), std::move(ReleaseErr));
    return AllErrors;
  }
  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
       "Submitted kernel " DPxMOD " to device %s\n", DPxPTR(zeKernel), ZeIdStr);

  if (!KEnv.IsAsync) {
    // Synchronous launch: wait for completion, then release the event whether
    // or not the wait succeeded.
    CALL_ZE_ACCUM_ERROR(AllErrors, zeEventHostSynchronize, Event,
                        L0DefaultTimeout);
    if (auto ReleaseErr = l0Device.releaseEvent(Event))
      AllErrors = joinErrors(std::move(AllErrors), std::move(ReleaseErr));
    if (AllErrors)
      return AllErrors;
  } else {
    // Asynchronous launch: record the event in the queue's wait list and as
    // its kernel event.
    Queue->WaitEvents.push_back(Event);
    Queue->KernelEvent = Event;
  }

  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
       "Executed kernel entry " DPxMOD " on device %s\n", DPxPTR(zeKernel),
       ZeIdStr);
  return Plugin::success();
}
/// Store the group counts in \p KEnv and set the kernel's group size.
///
/// Each group-size dimension is the requested thread count clamped,
/// independently of the other dimensions, to the kernel's MaxThreadGroupSize.
///
/// \param l0Device device, used for diagnostics output.
/// \param KEnv launch environment receiving the group counts.
/// \param NumThreads requested threads per dimension; every entry non-zero.
/// \param NumBlocks requested teams per dimension; every entry non-zero.
/// \returns success, or the error from zeKernelSetGroupSize.
Error L0KernelTy::setKernelGroups(L0DeviceTy &l0Device, L0LaunchEnvTy &KEnv,
                                  uint32_t NumThreads[3],
                                  uint32_t NumBlocks[3]) const {
  assert(NumThreads[0] > 0 && NumThreads[1] > 0 && NumThreads[2] > 0 &&
         "Pre-computed ThreadLimit values must be non-zero");
  assert(NumBlocks[0] > 0 && NumBlocks[1] > 0 && NumBlocks[2] > 0 &&
         "Pre-computed NumTeams values must be non-zero");

  // Team counts are taken over unchanged.
  KEnv.GroupCounts.groupCountX = NumBlocks[0];
  KEnv.GroupCounts.groupCountY = NumBlocks[1];
  KEnv.GroupCounts.groupCountZ = NumBlocks[2];

  // Respect max group size attribute in the kernel.
  const uint32_t MaxGroupSize = KEnv.KernelPR.MaxThreadGroupSize;
  uint32_t GroupSizes[3];
  for (int Dim = 0; Dim < 3; ++Dim)
    GroupSizes[Dim] = std::min<uint32_t>(MaxGroupSize, NumThreads[Dim]);

  const auto DeviceId = l0Device.getDeviceId();
  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
       "Team sizes = {%" PRIu32 ", %" PRIu32 ", %" PRIu32 "}\n", GroupSizes[0],
       GroupSizes[1], GroupSizes[2]);
  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
       "Number of teams = {%" PRIu32 ", %" PRIu32 ", %" PRIu32 "}\n",
       KEnv.GroupCounts.groupCountX, KEnv.GroupCounts.groupCountY,
       KEnv.GroupCounts.groupCountZ);

  CALL_ZE_RET_ERROR(zeKernelSetGroupSize, getZeKernel(), GroupSizes[0],
                    GroupSizes[1], GroupSizes[2]);
  return Plugin::success();
}
/// Update the kernel's indirect access flags when the allocator-derived flags
/// differ from the value cached in the kernel properties.
///
/// The value passed to Level Zero is the allocator flags combined with the
/// device's common flags; the value cached for the next comparison is the
/// allocator flags alone.
///
/// \param l0Device device providing the memory allocators and common flags.
/// \param KEnv launch environment whose kernel properties hold the cache.
/// \returns success, or the error from zeKernelSetIndirectAccess.
Error L0KernelTy::setIndirectFlags(L0DeviceTy &l0Device,
                                   L0LaunchEnvTy &KEnv) const {
  // Gather the flags of the host and device allocators, in that order.
  const auto HostFlags =
      l0Device.getMemAllocator(TARGET_ALLOC_HOST).getIndirectFlags();
  const auto DeviceFlags =
      l0Device.getMemAllocator(TARGET_ALLOC_DEVICE).getIndirectFlags();
  const ze_kernel_indirect_access_flags_t AllocFlags = HostFlags | DeviceFlags;

  // Nothing to do when the cached value already matches.
  if (KEnv.KernelPR.IndirectAccessFlags == AllocFlags)
    return Plugin::success();

  // Combine with common access flags.
  const auto FinalFlags = l0Device.getIndirectFlags() | AllocFlags;
  CALL_ZE_RET_ERROR(zeKernelSetIndirectAccess, zeKernel, FinalFlags);
  ODBG(OLDT_Kernel) << "Setting indirect access flags "
                    << reinterpret_cast<void *>(FinalFlags);
  KEnv.KernelPR.IndirectAccessFlags = AllocFlags;
  return Plugin::success();
}
/// Launch this kernel on \p GenericDevice.
///
/// Steps, in order: reject dynamic shared memory and unsupported cooperative
/// launches, resolve the async queue (reverting to a synchronous launch when
/// no queue can be obtained), take the launch lock, set the group size and
/// counts, validate the cooperative group count, set the kernel arguments and
/// the indirect access flags, and submit through the immediate command list.
///
/// \param GenericDevice device to launch on.
/// \param NumThreads requested threads per dimension.
/// \param NumBlocks requested teams per dimension.
/// \param DynBlockMemSize must be 0; any other value is rejected as
///        UNSUPPORTED.
/// \param KernelArgs launch arguments; only the Cooperative flag is read here.
/// \param LaunchParams its Data member points at the packed argument values.
/// \param AsyncInfoWrapper source of the async info used for async launches.
/// \returns success, or the first error encountered.
Error L0KernelTy::launchImpl(GenericDeviceTy &GenericDevice,
                             uint32_t NumThreads[3], uint32_t NumBlocks[3],
                             uint32_t DynBlockMemSize, KernelArgsTy &KernelArgs,
                             KernelLaunchParamsTy LaunchParams,
                             AsyncInfoWrapperTy &AsyncInfoWrapper) const {
  if (DynBlockMemSize > 0)
    return Plugin::error(ErrorCode::UNSUPPORTED,
                         "dynamic shared memory is unsupported in L0 plugin");

  auto &l0Device = L0DeviceTy::makeL0Device(GenericDevice);
  __tgt_async_info *AsyncInfo = AsyncInfoWrapper;
  auto zeKernel = getZeKernel();
  auto DeviceId = l0Device.getDeviceId();
  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, "Launching kernel " DPxMOD "...\n",
       DPxPTR(zeKernel));

  auto &Plugin = l0Device.getPlugin();
  auto *IdStr = l0Device.getZeIdCStr();
  auto &Options = Plugin.getOptions();
  bool IsAsync = AsyncInfo && l0Device.asyncEnabled();
  bool IsCooperative = KernelArgs.Flags.Cooperative;

  if (IsCooperative && !l0Device.supportsCooperativeKernels()) {
    return Plugin::error(
        ErrorCode::UNSUPPORTED,
        "cooperative kernel launch is not supported by the device");
  }

  if (IsAsync && !AsyncInfo->Queue) {
    AsyncInfo->Queue = reinterpret_cast<void *>(Plugin.getAsyncQueue());
    if (!AsyncInfo->Queue)
      IsAsync = false; // Couldn't get a queue, revert to sync.
  }
  auto *AsyncQueue =
      IsAsync ? static_cast<AsyncQueueTy *>(AsyncInfo->Queue) : nullptr;

  auto &KernelPR = getProperties();
  L0LaunchEnvTy KEnv(IsAsync, IsCooperative, AsyncQueue, KernelPR);

  // Protect from kernel preparation to submission as kernels are shared.
  // NOTE(review): the early returns below do not unlock explicitly; this
  // assumes KEnv.Lock releases the mutex when KEnv is destroyed (e.g. a
  // std::unique_lock) -- confirm against the L0LaunchEnvTy definition.
  KEnv.Lock.lock();

  if (auto Err = setKernelGroups(l0Device, KEnv, NumThreads, NumBlocks))
    return Err;

  // Validate cooperative kernel launch constraints
  if (IsCooperative) {
    uint32_t MaxCooperativeGroupCount = 0;
    CALL_ZE_RET_ERROR(zeKernelSuggestMaxCooperativeGroupCount, zeKernel,
                      &MaxCooperativeGroupCount);

    // Multiply in 64 bits and saturate, so that a very large grid cannot wrap
    // around in 32-bit arithmetic and slip past the limit check below.
    const uint64_t CountXY =
        static_cast<uint64_t>(KEnv.GroupCounts.groupCountX) *
        KEnv.GroupCounts.groupCountY;
    const uint64_t CountZ = KEnv.GroupCounts.groupCountZ;
    const uint64_t TotalGroupCount =
        (CountZ != 0 && CountXY > UINT64_MAX / CountZ) ? UINT64_MAX
                                                       : CountXY * CountZ;

    if (TotalGroupCount > MaxCooperativeGroupCount) {
      // Release through the same object that acquired the lock above, so the
      // lock object's view of ownership stays consistent.
      KEnv.Lock.unlock();
      return Plugin::error(
          ErrorCode::INVALID_ARGUMENT,
          "cooperative kernel launch failed: requested %" PRIu64
          " groups exceeds "
          "maximum %u cooperative groups supported by device",
          TotalGroupCount, MaxCooperativeGroupCount);
    }
    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Cooperative kernel validated: using %" PRIu64 " groups (max: %u)\n",
         TotalGroupCount, MaxCooperativeGroupCount);
  }

  // Set kernel arguments.
  uint32_t NumKernelArgs = KernelPR.NumKernelArgs;
  if (NumKernelArgs > 0) {
    if (!KernelPR.ArgSizes)
      return Plugin::error(ErrorCode::INVALID_ARGUMENT,
                           "level zero plugin requires kernel argument sizes.");
    // Use sizes from kernel properties.
    // TODO: This is a temporary workaround; it will not work if there is
    // padding/alignment between arguments.
    char *Arg = static_cast<char *>(LaunchParams.Data);
    for (uint32_t I = 0; I < NumKernelArgs; I++) {
      uint32_t ArgSize = KernelPR.ArgSizes[I];
      CALL_ZE_RET_ERROR(zeKernelSetArgumentValue, zeKernel, I, ArgSize, Arg);
      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
           "Kernel Pointer argument %" PRIu32 " (value: " DPxMOD
           ") was set successfully for device %s.\n",
           I, DPxPTR(Arg), IdStr);
      // Arguments are read back-to-back from the packed buffer.
      Arg += ArgSize;
    }
  }

  if (auto Err = setIndirectFlags(l0Device, KEnv))
    return Err;

  // The next call should unlock the KernelLock internally.
  return launchKernelWithImmCmdList(l0Device, zeKernel, KEnv,
                                    Options.CommandMode);
}
/// Ask Level Zero for the suggested maximum number of cooperative groups for
/// this kernel at the given group size.
///
/// The group size is set on the kernel handle first, then the suggestion is
/// queried.
/// NOTE(review): the group size is changed on the kernel handle without any
/// lock being taken in this function -- confirm callers serialize this with
/// kernel launches.
///
/// \param GenericDevice not used by this implementation.
/// \param NumThreads group size per dimension.
/// \param DynBlockMemSize not used by this implementation.
/// \returns the suggested maximum group count, or an UNSUPPORTED error when
///          either Level Zero call fails.
Expected<uint32_t>
L0KernelTy::getMaxCooperativeGroupCount(GenericDeviceTy &GenericDevice,
                                        const uint32_t NumThreads[3],
                                        uint32_t DynBlockMemSize) const {
  const ze_result_t SetSizeRes = zeKernelSetGroupSize(
      zeKernel, NumThreads[0], NumThreads[1], NumThreads[2]);
  if (SetSizeRes != ZE_RESULT_SUCCESS)
    return Plugin::error(ErrorCode::UNSUPPORTED,
                         "failed to set group size for cooperative launch");

  uint32_t MaxCooperativeGroupCount = 0;
  const ze_result_t QueryRes = zeKernelSuggestMaxCooperativeGroupCount(
      zeKernel, &MaxCooperativeGroupCount);
  if (QueryRes != ZE_RESULT_SUCCESS)
    return Plugin::error(ErrorCode::UNSUPPORTED,
                         "failed to query max cooperative group count");

  return MaxCooperativeGroupCount;
}
} // namespace llvm::omp::target::plugin