blob: e6d7bba305fd8e46cdfad6b2e6cfc488ca4fc568 [file] [log] [blame] [edit]
//===--- Level Zero Target RTL Implementation -----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// GenericKernel implementation for SPIR-V/Xe machine.
//
//===----------------------------------------------------------------------===//
#include "L0Kernel.h"
#include "L0Device.h"
#include "L0Plugin.h"
#include "L0Program.h"
namespace llvm::omp::target::plugin {
bool KernelPropertiesTy::reuseGroupParams(const int32_t NumTeamsIn,
                                          const int32_t ThreadLimitIn,
                                          uint32_t *GroupSizesOut,
                                          L0LaunchEnvTy &KEnv) const {
  // The cached shape is only valid when both launch inputs match exactly.
  const bool InputsMatch =
      (NumTeamsIn == NumTeams) && (ThreadLimitIn == ThreadLimit);
  if (!InputsMatch)
    return false;
  // Cache hit: hand back the previously computed sizes and counts.
  GroupSizesOut[0] = GroupSizes[0];
  GroupSizesOut[1] = GroupSizes[1];
  GroupSizesOut[2] = GroupSizes[2];
  KEnv.GroupCounts = GroupCounts;
  return true;
}
void KernelPropertiesTy::cacheGroupParams(const int32_t NumTeamsIn,
                                          const int32_t ThreadLimitIn,
                                          const uint32_t *GroupSizesIn,
                                          L0LaunchEnvTy &KEnv) {
  // Remember the launch inputs so reuseGroupParams() can detect a repeat
  // launch with the same parameters.
  NumTeams = NumTeamsIn;
  ThreadLimit = ThreadLimitIn;
  // Snapshot the computed 3D group sizes and counts.
  std::copy_n(GroupSizesIn, 3, GroupSizes);
  GroupCounts = KEnv.GroupCounts;
}
// Query Level Zero for this kernel's properties and cache the derived
// values (SIMD width, preferred width, max thread-group size) in the
// kernel's KernelPropertiesTy.
Error L0KernelTy::readKernelProperties(L0ProgramTy &Program) {
  const auto &Device = L0DeviceTy::makeL0Device(Program.getDevice());
  auto &KernelPR = getProperties();

  // Base properties struct; chain the preferred-group-size extension via
  // pNext only when the driver is new enough to understand it.
  ze_kernel_properties_t Props = {};
  Props.stype = ZE_STRUCTURE_TYPE_KERNEL_PROPERTIES;
  Props.pNext = nullptr;
  ze_kernel_preferred_group_size_properties_t PrefGroupSize = {};
  PrefGroupSize.stype =
      ZE_STRUCTURE_TYPE_KERNEL_PREFERRED_GROUP_SIZE_PROPERTIES;
  PrefGroupSize.pNext = nullptr;
  const bool HasPrefGroupSize =
      Device.getDriverAPIVersion() >= ZE_API_VERSION_1_2;
  if (HasPrefGroupSize)
    Props.pNext = &PrefGroupSize;
  CALL_ZE_RET_ERROR(zeKernelGetProperties, zeKernel, &Props);

  KernelPR.SIMDWidth = Props.maxSubgroupSize;
  // Prefer the driver-reported multiple when available, otherwise fall back
  // to the maximum subgroup size.
  KernelPR.Width = HasPrefGroupSize ? PrefGroupSize.preferredMultiple
                                    : Props.maxSubgroupSize;
  // On non-Gen architectures widen to at least twice the SIMD width.
  if (!Device.isDeviceArch(DeviceArchTy::DeviceArch_Gen))
    KernelPR.Width = (std::max)(KernelPR.Width, 2 * KernelPR.SIMDWidth);
  KernelPR.MaxThreadGroupSize = Props.maxSubgroupSize * Props.maxNumSubgroups;
  return Plugin::success();
}
// Create the Level Zero kernel handle for this kernel's name from the
// owning program's module, then read and cache its properties.
Error L0KernelTy::buildKernel(L0ProgramTy &Program) {
  const auto *KernelName = getName();
  auto Module = Program.findModuleFromKernelName(KernelName);
  const ze_kernel_desc_t KernelDesc = {ZE_STRUCTURE_TYPE_KERNEL_DESC, nullptr,
                                       0, KernelName};
  CALL_ZE_RET_ERROR(zeKernelCreate, Module, &KernelDesc, &zeKernel);
  // Any failure reading properties is propagated directly to the caller.
  return readKernelProperties(Program);
}
// GenericKernel hook: build the L0 kernel from the device image's program
// and register it with the program so it can be tracked/destroyed there.
Error L0KernelTy::initImpl(GenericDeviceTy &GenericDevice,
                           DeviceImageTy &Image) {
  auto &Program = L0ProgramTy::makeL0Program(Image);
  auto Err = buildKernel(Program);
  if (Err)
    return Err;
  Program.addKernel(this);
  return Plugin::success();
}
// Compute the workgroup sizes and counts for a kernel launch.
// NumTeams/ThreadLimit of 0 mean "not specified by the user"; in that case
// the routine picks values intended to saturate the device. Results are
// written to GroupSizes[0..2] and KEnv.GroupCounts.
void L0KernelTy::decideKernelGroupArguments(L0DeviceTy &Device,
                                            uint32_t NumTeams,
                                            uint32_t ThreadLimit,
                                            uint32_t *GroupSizes,
                                            L0LaunchEnvTy &KEnv) const {
  const KernelPropertiesTy &KernelPR = getProperties();
  const auto DeviceId = Device.getDeviceId();
  // Track whether each dimension was explicitly requested by the user.
  bool MaxGroupSizeForced = false;
  bool MaxGroupCountForced = false;
  uint32_t MaxGroupSize = Device.getMaxGroupSize();
  const auto &Option = LevelZeroPluginTy::getOptions();
  const auto OptSubscRate = Option.SubscriptionRate;
  auto &GroupCounts = KEnv.GroupCounts;
  uint32_t SIMDWidth = KernelPR.SIMDWidth;
  uint32_t KernelWidth = KernelPR.Width;
  uint32_t KernelMaxThreadGroupSize = KernelPR.MaxThreadGroupSize;
  // The kernel itself may support fewer threads per group than the device.
  if (KernelMaxThreadGroupSize < MaxGroupSize) {
    MaxGroupSize = KernelMaxThreadGroupSize;
    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Capping maximum team size to %" PRIu32
         " due to kernel constraints.\n",
         MaxGroupSize);
  }
  // A user-provided thread limit overrides the device/kernel cap.
  if (ThreadLimit > 0) {
    MaxGroupSizeForced = true;
    MaxGroupSize = ThreadLimit;
  }
  uint32_t MaxGroupCount = 0;
  if (NumTeams > 0) {
    MaxGroupCount = NumTeams;
    MaxGroupCountForced = true;
  }
  if (MaxGroupCountForced) {
    // If number of teams is specified by the user, then use KernelWidth.
    // WIs per WG by default, so that it matches
    // decideLoopKernelGroupArguments() behavior.
    if (!MaxGroupSizeForced) {
      MaxGroupSize = KernelWidth;
    }
  } else {
    // No team count given: size the launch from HW occupancy — one group
    // per available HW thread across all subslices.
    const uint32_t NumSubslices = Device.getNumSubslices();
    uint32_t NumThreadsPerSubslice = Device.getNumThreadsPerSubslice();
    // HalfNumThreads is set by getGroupsShape() when the L0 debugger is
    // enabled on XeHPG; presumably the debugger reserves half the HW
    // threads — TODO confirm.
    if (KEnv.HalfNumThreads)
      NumThreadsPerSubslice /= 2;
    MaxGroupCount = NumSubslices * NumThreadsPerSubslice;
    if (MaxGroupSizeForced) {
      // Set group size for the HW capacity.
      // Each group occupies ceil(MaxGroupSize / SIMDWidth) HW threads.
      uint32_t NumThreadsPerGroup = (MaxGroupSize + SIMDWidth - 1) / SIMDWidth;
      uint32_t NumGroupsPerSubslice =
          (NumThreadsPerSubslice + NumThreadsPerGroup - 1) / NumThreadsPerGroup;
      MaxGroupCount = NumGroupsPerSubslice * NumSubslices;
    } else {
      // Neither dimension forced (restates the branch conditions).
      assert(!MaxGroupSizeForced && !MaxGroupCountForced);
      assert((MaxGroupSize <= KernelWidth || MaxGroupSize % KernelWidth == 0) &&
             "Invalid maxGroupSize");
      // Maximize group size.
      // Shrink the group size in KernelWidth steps until groups tile a
      // subslice's HW threads exactly (no wasted threads).
      while (MaxGroupSize >= KernelWidth) {
        uint32_t NumThreadsPerGroup =
            (MaxGroupSize + SIMDWidth - 1) / SIMDWidth;
        if (NumThreadsPerSubslice % NumThreadsPerGroup == 0) {
          uint32_t NumGroupsPerSubslice =
              NumThreadsPerSubslice / NumThreadsPerGroup;
          MaxGroupCount = NumGroupsPerSubslice * NumSubslices;
          break;
        }
        MaxGroupSize -= KernelWidth;
      }
    }
  }
  // Launches are one-dimensional: Y and Z are always 1.
  uint32_t GRPCounts[3] = {MaxGroupCount, 1, 1};
  uint32_t GRPSizes[3] = {MaxGroupSize, 1, 1};
  // Oversubscribe only when the user did not pin the team count.
  if (!MaxGroupCountForced) {
    GRPCounts[0] *= OptSubscRate;
  }
  GroupCounts.groupCountX = GRPCounts[0];
  GroupCounts.groupCountY = GRPCounts[1];
  GroupCounts.groupCountZ = GRPCounts[2];
  std::copy(GRPSizes, GRPSizes + 3, GroupSizes);
}
// Determine the team (workgroup) shape for a kernel launch, clamping the
// requested NumTeams/ThreadLimit (0 = unspecified) against device and
// kernel limits, then delegating the final choice to
// decideKernelGroupArguments(). Results land in GroupSizes[0..2] and
// KEnv.GroupCounts.
Error L0KernelTy::getGroupsShape(L0DeviceTy &Device, int32_t NumTeams,
                                 int32_t ThreadLimit, uint32_t *GroupSizes,
                                 L0LaunchEnvTy &KEnv) const {
  const auto DeviceId = Device.getDeviceId();
  const auto &KernelPR = getProperties();
  // Read the most recent global thread limit and max teams.
  // NOTE: currently hard-wired to 0 (unset), which makes the ICV branches
  // below inactive; kept so the ICV plumbing remains in place.
  const int32_t NumTeamsICV = 0;
  const int32_t ThreadLimitICV = 0;
  // With the L0 debugger enabled on XeHPG, only half the HW threads are
  // used for compute (see decideKernelGroupArguments).
  bool IsXeHPG = Device.isDeviceArch(DeviceArchTy::DeviceArch_XeHPG);
  KEnv.HalfNumThreads =
      LevelZeroPluginTy::getOptions().ZeDebugEnabled && IsXeHPG;
  uint32_t KernelWidth = KernelPR.Width;
  uint32_t SIMDWidth = KernelPR.SIMDWidth;
  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
       "Assumed kernel SIMD width is %" PRIu32 "\n", SIMDWidth);
  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
       "Preferred team size is multiple of %" PRIu32 "\n", KernelWidth);
  assert(SIMDWidth <= KernelWidth && "Invalid SIMD width.");
  if (ThreadLimit > 0) {
    // use thread_limit clause value default.
    DP("Max team size is set to %" PRId32 " (thread_limit clause)\n",
       ThreadLimit);
  } else if (ThreadLimitICV > 0) {
    // else use thread-limit-var ICV.
    ThreadLimit = ThreadLimitICV;
    DP("Max team size is set to %" PRId32 " (thread-limit-icv)\n", ThreadLimit);
  }
  size_t MaxThreadLimit = Device.getMaxGroupSize();
  // Set correct max group size if the kernel was compiled with explicit SIMD.
  if (SIMDWidth == 1)
    MaxThreadLimit = Device.getNumThreadsPerSubslice();
  // The kernel may allow fewer threads per group than the device does.
  if (KernelPR.MaxThreadGroupSize < MaxThreadLimit) {
    MaxThreadLimit = KernelPR.MaxThreadGroupSize;
    DP("Capping maximum team size to %zu due to kernel constraints.\n",
       MaxThreadLimit);
  }
  // Clamp a user request that exceeds the effective maximum.
  if (ThreadLimit > static_cast<int32_t>(MaxThreadLimit)) {
    ThreadLimit = MaxThreadLimit;
    DP("Max team size exceeds current maximum %zu. Adjusted\n",
       MaxThreadLimit);
  }
  // scope code to ease integration with downstream custom code.
  {
    if (NumTeams > 0) {
      DP("Number of teams is set to %" PRId32
         " (num_teams clause or no teams construct)\n",
         NumTeams);
    } else if (NumTeamsICV > 0) {
      // OMP_NUM_TEAMS only matters, if num_teams() clause is absent.
      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
           "OMP_NUM_TEAMS(%" PRId32 ") is ignored\n", NumTeamsICV);
      NumTeams = NumTeamsICV;
      DP("Max number of teams is set to %" PRId32 " (OMP_NUM_TEAMS)\n",
         NumTeams);
    }
    decideKernelGroupArguments(Device, (uint32_t)NumTeams,
                               (uint32_t)ThreadLimit, GroupSizes, KEnv);
  }
  return Plugin::success();
}
// Submit the kernel through an immediate command list (commands execute as
// they are appended; no explicit queue submission/close step).
// Precondition: KEnv.KernelPR.Mtx is held by the caller (launchImpl); this
// function releases it after the kernel has been appended.
static Error launchKernelWithImmCmdList(L0DeviceTy &l0Device,
                                        ze_kernel_handle_t zeKernel,
                                        L0LaunchEnvTy &KEnv,
                                        CommandModeTy CommandMode) {
  const auto DeviceId = l0Device.getDeviceId();
  auto *IdStr = l0Device.getZeIdCStr();
  auto CmdListOrErr = l0Device.getImmCmdList();
  if (!CmdListOrErr)
    return CmdListOrErr.takeError();
  const ze_command_list_handle_t CmdList = *CmdListOrErr;
  // Command queue is not used with immediate command list.
  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
       "Using immediate command list for kernel submission.\n");
  // Completion event signaled when this kernel finishes.
  auto EventOrError = l0Device.getEvent();
  if (!EventOrError)
    return EventOrError.takeError();
  ze_event_handle_t Event = *EventOrError;
  size_t NumWaitEvents = 0;
  ze_event_handle_t *WaitEvents = nullptr;
  auto *AsyncQueue = KEnv.AsyncQueue;
  // Build the dependency list from outstanding async (data-copy) events.
  if (KEnv.IsAsync && !AsyncQueue->WaitEvents.empty()) {
    if (CommandMode == CommandModeTy::AsyncOrdered) {
      // Ordered mode: waiting on the most recent event is sufficient,
      // since earlier commands complete before it.
      NumWaitEvents = 1;
      WaitEvents = &AsyncQueue->WaitEvents.back();
    } else {
      // Unordered mode: the kernel must wait on every outstanding event.
      NumWaitEvents = AsyncQueue->WaitEvents.size();
      WaitEvents = AsyncQueue->WaitEvents.data();
    }
  }
  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
       "Kernel depends on %zu data copying events.\n", NumWaitEvents);
  // NOTE(review): CALL_ZE_RET_ERROR and the early returns above exit while
  // the caller-held KEnv.KernelPR.Mtx is still locked — confirm whether
  // error paths should release the lock.
  CALL_ZE_RET_ERROR(zeCommandListAppendLaunchKernel, CmdList, zeKernel,
                    &KEnv.GroupCounts, Event, NumWaitEvents, WaitEvents);
  // Kernel state (group size, arguments) has been consumed by the append;
  // other threads may now prepare this shared kernel again.
  KEnv.KernelPR.Mtx.unlock();
  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
       "Submitted kernel " DPxMOD " to device %s\n", DPxPTR(zeKernel), IdStr);
  if (KEnv.IsAsync) {
    // Async: record the event so later operations can depend on it; the
    // event is not released here.
    AsyncQueue->WaitEvents.push_back(Event);
    AsyncQueue->KernelEvent = Event;
  } else {
    // Sync: block until completion, then recycle the event.
    CALL_ZE_RET_ERROR(zeEventHostSynchronize, Event, L0DefaultTimeout);
    if (auto Err = l0Device.releaseEvent(Event))
      return Err;
  }
  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
       "Executed kernel entry " DPxMOD " on device %s\n", DPxPTR(zeKernel),
       IdStr);
  return Plugin::success();
}
// Submit the kernel through a regular command list + command queue
// (append, close, execute, synchronize, reset). Always synchronous.
// Precondition: KEnv.KernelPR.Mtx is held by the caller (launchImpl); this
// function releases it after the kernel has been appended.
static Error launchKernelWithCmdQueue(L0DeviceTy &l0Device,
                                      ze_kernel_handle_t zeKernel,
                                      L0LaunchEnvTy &KEnv) {
  const auto DeviceId = l0Device.getDeviceId();
  const auto *IdStr = l0Device.getZeIdCStr();
  auto CmdListOrErr = l0Device.getCmdList();
  if (!CmdListOrErr)
    return CmdListOrErr.takeError();
  ze_command_list_handle_t CmdList = *CmdListOrErr;
  auto CmdQueueOrErr = l0Device.getCmdQueue();
  if (!CmdQueueOrErr)
    return CmdQueueOrErr.takeError();
  const ze_command_queue_handle_t CmdQueue = *CmdQueueOrErr;
  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
       "Using regular command list for kernel submission.\n");
  // No completion event is used on this path; completion is observed via
  // zeCommandQueueSynchronize below.
  ze_event_handle_t Event = nullptr;
  // NOTE(review): CALL_ZE_RET_ERROR and the early returns above exit while
  // the caller-held KEnv.KernelPR.Mtx is still locked — confirm whether
  // error paths should release the lock.
  CALL_ZE_RET_ERROR(zeCommandListAppendLaunchKernel, CmdList, zeKernel,
                    &KEnv.GroupCounts, Event, 0, nullptr);
  // Kernel state has been consumed; other threads may prepare it again.
  KEnv.KernelPR.Mtx.unlock();
  CALL_ZE_RET_ERROR(zeCommandListClose, CmdList);
  CALL_ZE_RET_ERROR_MTX(zeCommandQueueExecuteCommandLists, l0Device.getMutex(),
                        CmdQueue, 1, &CmdList, nullptr);
  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
       "Submitted kernel " DPxMOD " to device %s\n", DPxPTR(zeKernel), IdStr);
  CALL_ZE_RET_ERROR(zeCommandQueueSynchronize, CmdQueue, L0DefaultTimeout);
  CALL_ZE_RET_ERROR(zeCommandListReset, CmdList);
  // NOTE(review): Event is never assigned a real handle on this path, so
  // this release is currently dead code — confirm intent.
  if (Event) {
    if (auto Err = l0Device.releaseEvent(Event))
      return Err;
  }
  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
       "Executed kernel entry " DPxMOD " on device %s\n", DPxPTR(zeKernel),
       IdStr);
  return Plugin::success();
}
// Configure the group sizes/counts for the upcoming launch, reusing cached
// parameters when the launch bounds are unchanged from the previous call.
Error L0KernelTy::setKernelGroups(L0DeviceTy &l0Device, L0LaunchEnvTy &KEnv,
                                  uint32_t NumThreads[3],
                                  uint32_t NumBlocks[3]) const {
  const bool IsBareMode =
      KernelEnvironment.Configuration.ExecMode == OMP_TGT_EXEC_MODE_BARE;
  if (!IsBareMode) {
    // For non-bare mode, the groups are already set in the launch.
    KEnv.GroupCounts = {NumBlocks[0], NumBlocks[1], NumBlocks[2]};
    CALL_ZE_RET_ERROR(zeKernelSetGroupSize, getZeKernel(), NumThreads[0],
                      NumThreads[1], NumThreads[2]);
    return Plugin::success();
  }
  // Bare mode: treat negative requests as "unspecified" (0).
  const int32_t TeamsReq =
      (std::max)(static_cast<int32_t>(NumBlocks[0]), 0);
  const int32_t ThreadLimitReq =
      (std::max)(static_cast<int32_t>(NumThreads[0]), 0);
  auto DeviceId = l0Device.getDeviceId();
  auto &KernelPR = KEnv.KernelPR;
  uint32_t GroupSizes[3];
  // Check if we can reuse previous group parameters.
  const bool Reused =
      KernelPR.reuseGroupParams(TeamsReq, ThreadLimitReq, GroupSizes, KEnv);
  if (!Reused) {
    // Compute a fresh shape and remember it for the next launch.
    if (auto Err = getGroupsShape(l0Device, TeamsReq, ThreadLimitReq,
                                  GroupSizes, KEnv))
      return Err;
    KernelPR.cacheGroupParams(TeamsReq, ThreadLimitReq, GroupSizes, KEnv);
  }
  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
       "Team sizes = {%" PRIu32 ", %" PRIu32 ", %" PRIu32 "}\n", GroupSizes[0],
       GroupSizes[1], GroupSizes[2]);
  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
       "Number of teams = {%" PRIu32 ", %" PRIu32 ", %" PRIu32 "}\n",
       KEnv.GroupCounts.groupCountX, KEnv.GroupCounts.groupCountY,
       KEnv.GroupCounts.groupCountZ);
  // On a cache hit the group size set previously is still in effect.
  if (!Reused) {
    CALL_ZE_RET_ERROR(zeKernelSetGroupSize, getZeKernel(), GroupSizes[0],
                      GroupSizes[1], GroupSizes[2]);
  }
  return Plugin::success();
}
// Set the kernel's indirect-access flags from the host and device memory
// allocators, skipping the driver call when the cached flags are current.
Error L0KernelTy::setIndirectFlags(L0DeviceTy &l0Device,
                                   L0LaunchEnvTy &KEnv) const {
  const ze_kernel_indirect_access_flags_t AllocFlags =
      l0Device.getMemAllocator(TARGET_ALLOC_HOST).getIndirectFlags() |
      l0Device.getMemAllocator(TARGET_ALLOC_DEVICE).getIndirectFlags();
  // Nothing to do when the kernel already carries these flags.
  if (KEnv.KernelPR.IndirectAccessFlags == AllocFlags)
    return Plugin::success();
  // Combine with common access flags.
  const auto FinalFlags = l0Device.getIndirectFlags() | AllocFlags;
  CALL_ZE_RET_ERROR(zeKernelSetIndirectAccess, zeKernel, FinalFlags);
  DP("Setting indirect access flags " DPxMOD "\n", DPxPTR(FinalFlags));
  KEnv.KernelPR.IndirectAccessFlags = AllocFlags;
  return Plugin::success();
}
// GenericKernel launch hook: decide/restore the group shape, set kernel
// arguments and indirect-access flags, then submit via an immediate
// command list or a regular command queue.
// Locking: KernelPR.Mtx is taken here and released inside the launch
// helpers after the kernel is appended, since kernels are shared.
Error L0KernelTy::launchImpl(GenericDeviceTy &GenericDevice,
                             uint32_t NumThreads[3], uint32_t NumBlocks[3],
                             KernelArgsTy &KernelArgs,
                             KernelLaunchParamsTy LaunchParams,
                             AsyncInfoWrapperTy &AsyncInfoWrapper) const {
  auto &l0Device = L0DeviceTy::makeL0Device(GenericDevice);
  __tgt_async_info *AsyncInfo = AsyncInfoWrapper;
  auto zeKernel = getZeKernel();
  auto DeviceId = l0Device.getDeviceId();
  int32_t NumArgs = KernelArgs.NumArgs;
  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, "Launching kernel " DPxMOD "...\n",
       DPxPTR(zeKernel));
  auto &Plugin = l0Device.getPlugin();
  auto *IdStr = l0Device.getZeIdCStr();
  auto &Options = LevelZeroPluginTy::getOptions();
  // Async requires both an async-info object and device-level async support.
  bool IsAsync = AsyncInfo && l0Device.asyncEnabled();
  if (IsAsync && !AsyncInfo->Queue) {
    AsyncInfo->Queue = reinterpret_cast<void *>(Plugin.getAsyncQueue());
    if (!AsyncInfo->Queue)
      IsAsync = false; // Couldn't get a queue, revert to sync.
  }
  auto *AsyncQueue =
      IsAsync ? static_cast<AsyncQueueTy *>(AsyncInfo->Queue) : nullptr;
  auto &KernelPR = getProperties();
  L0LaunchEnvTy KEnv(IsAsync, AsyncQueue, KernelPR);
  // Protect from kernel preparation to submission as kernels are shared.
  KernelPR.Mtx.lock();
  // NOTE(review): every early error return from here until the launch
  // helpers unlock the mutex leaves KernelPR.Mtx held — confirm whether
  // these paths should release it (e.g. via a scoped unlock guard).
  if (auto Err = setKernelGroups(l0Device, KEnv, NumThreads, NumBlocks))
    return Err;
  // Set kernel arguments.
  // Each argument is a pointer-sized value; a null pointer is passed by
  // giving zeKernelSetArgumentValue a null pArgValue.
  for (int32_t I = 0; I < NumArgs; I++) {
    // Scope code to ease integration with downstream custom code.
    {
      void *Arg = (static_cast<void **>(LaunchParams.Data))[I];
      CALL_ZE_RET_ERROR(zeKernelSetArgumentValue, zeKernel, I, sizeof(Arg),
                        Arg == nullptr ? nullptr : &Arg);
      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
           "Kernel Pointer argument %" PRId32 " (value: " DPxMOD
           ") was set successfully for device %s.\n",
           I, DPxPTR(Arg), IdStr);
    }
  }
  if (auto Err = setIndirectFlags(l0Device, KEnv))
    return Err;
  // The next calls should unlock the KernelLock internally.
  const bool UseImmCmdList = l0Device.useImmForCompute();
  if (UseImmCmdList)
    return launchKernelWithImmCmdList(l0Device, zeKernel, KEnv,
                                      Options.CommandMode);
  return launchKernelWithCmdQueue(l0Device, zeKernel, KEnv);
}
} // namespace llvm::omp::target::plugin