| //===--- Level Zero Target RTL Implementation -----------------------------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // GenericKernel implementation for SPIR-V/Xe machine. |
| // |
| //===----------------------------------------------------------------------===// |
| |
#include "L0Kernel.h"
#include "L0Device.h"
#include "L0Plugin.h"
#include "L0Program.h"

#include <mutex>
| |
| namespace llvm::omp::target::plugin { |
| |
| bool KernelPropertiesTy::reuseGroupParams(const int32_t NumTeamsIn, |
| const int32_t ThreadLimitIn, |
| uint32_t *GroupSizesOut, |
| L0LaunchEnvTy &KEnv) const { |
| if (NumTeamsIn != NumTeams || ThreadLimitIn != ThreadLimit) |
| return false; |
| // Found matching input parameters. |
| std::copy_n(GroupSizes, 3, GroupSizesOut); |
| KEnv.GroupCounts = GroupCounts; |
| return true; |
| } |
| |
| void KernelPropertiesTy::cacheGroupParams(const int32_t NumTeamsIn, |
| const int32_t ThreadLimitIn, |
| const uint32_t *GroupSizesIn, |
| L0LaunchEnvTy &KEnv) { |
| NumTeams = NumTeamsIn; |
| ThreadLimit = ThreadLimitIn; |
| std::copy_n(GroupSizesIn, 3, GroupSizes); |
| GroupCounts = KEnv.GroupCounts; |
| } |
| |
| Error L0KernelTy::readKernelProperties(L0ProgramTy &Program) { |
| const auto &l0Device = L0DeviceTy::makeL0Device(Program.getDevice()); |
| auto &KernelPR = getProperties(); |
| ze_kernel_properties_t KP = {}; |
| KP.stype = ZE_STRUCTURE_TYPE_KERNEL_PROPERTIES; |
| KP.pNext = nullptr; |
| ze_kernel_preferred_group_size_properties_t KPrefGRPSize = {}; |
| KPrefGRPSize.stype = ZE_STRUCTURE_TYPE_KERNEL_PREFERRED_GROUP_SIZE_PROPERTIES; |
| KPrefGRPSize.pNext = nullptr; |
| if (l0Device.getDriverAPIVersion() >= ZE_API_VERSION_1_2) |
| KP.pNext = &KPrefGRPSize; |
| |
| CALL_ZE_RET_ERROR(zeKernelGetProperties, zeKernel, &KP); |
| KernelPR.SIMDWidth = KP.maxSubgroupSize; |
| KernelPR.Width = KP.maxSubgroupSize; |
| |
| if (KP.pNext) |
| KernelPR.Width = KPrefGRPSize.preferredMultiple; |
| |
| if (!l0Device.isDeviceArch(DeviceArchTy::DeviceArch_Gen)) { |
| KernelPR.Width = (std::max)(KernelPR.Width, 2 * KernelPR.SIMDWidth); |
| } |
| KernelPR.MaxThreadGroupSize = KP.maxSubgroupSize * KP.maxNumSubgroups; |
| return Plugin::success(); |
| } |
| |
| Error L0KernelTy::buildKernel(L0ProgramTy &Program) { |
| const auto *KernelName = getName(); |
| |
| auto Module = Program.findModuleFromKernelName(KernelName); |
| ze_kernel_desc_t KernelDesc = {ZE_STRUCTURE_TYPE_KERNEL_DESC, nullptr, 0, |
| KernelName}; |
| CALL_ZE_RET_ERROR(zeKernelCreate, Module, &KernelDesc, &zeKernel); |
| if (auto Err = readKernelProperties(Program)) |
| return Err; |
| |
| return Plugin::success(); |
| } |
| |
| Error L0KernelTy::initImpl(GenericDeviceTy &GenericDevice, |
| DeviceImageTy &Image) { |
| auto &Program = L0ProgramTy::makeL0Program(Image); |
| |
| if (auto Err = buildKernel(Program)) |
| return Err; |
| Program.addKernel(this); |
| |
| return Plugin::success(); |
| } |
| |
| void L0KernelTy::decideKernelGroupArguments(L0DeviceTy &Device, |
| uint32_t NumTeams, |
| uint32_t ThreadLimit, |
| uint32_t *GroupSizes, |
| L0LaunchEnvTy &KEnv) const { |
| |
| const KernelPropertiesTy &KernelPR = getProperties(); |
| |
| const auto DeviceId = Device.getDeviceId(); |
| bool MaxGroupSizeForced = false; |
| bool MaxGroupCountForced = false; |
| uint32_t MaxGroupSize = Device.getMaxGroupSize(); |
| const auto &Option = LevelZeroPluginTy::getOptions(); |
| const auto OptSubscRate = Option.SubscriptionRate; |
| auto &GroupCounts = KEnv.GroupCounts; |
| |
| uint32_t SIMDWidth = KernelPR.SIMDWidth; |
| uint32_t KernelWidth = KernelPR.Width; |
| uint32_t KernelMaxThreadGroupSize = KernelPR.MaxThreadGroupSize; |
| |
| if (KernelMaxThreadGroupSize < MaxGroupSize) { |
| MaxGroupSize = KernelMaxThreadGroupSize; |
| INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, |
| "Capping maximum team size to %" PRIu32 |
| " due to kernel constraints.\n", |
| MaxGroupSize); |
| } |
| |
| if (ThreadLimit > 0) { |
| MaxGroupSizeForced = true; |
| MaxGroupSize = ThreadLimit; |
| } |
| |
| uint32_t MaxGroupCount = 0; |
| if (NumTeams > 0) { |
| MaxGroupCount = NumTeams; |
| MaxGroupCountForced = true; |
| } |
| |
| if (MaxGroupCountForced) { |
| // If number of teams is specified by the user, then use KernelWidth. |
| // WIs per WG by default, so that it matches |
| // decideLoopKernelGroupArguments() behavior. |
| if (!MaxGroupSizeForced) { |
| MaxGroupSize = KernelWidth; |
| } |
| } else { |
| const uint32_t NumSubslices = Device.getNumSubslices(); |
| uint32_t NumThreadsPerSubslice = Device.getNumThreadsPerSubslice(); |
| if (KEnv.HalfNumThreads) |
| NumThreadsPerSubslice /= 2; |
| |
| MaxGroupCount = NumSubslices * NumThreadsPerSubslice; |
| if (MaxGroupSizeForced) { |
| // Set group size for the HW capacity. |
| uint32_t NumThreadsPerGroup = (MaxGroupSize + SIMDWidth - 1) / SIMDWidth; |
| uint32_t NumGroupsPerSubslice = |
| (NumThreadsPerSubslice + NumThreadsPerGroup - 1) / NumThreadsPerGroup; |
| MaxGroupCount = NumGroupsPerSubslice * NumSubslices; |
| } else { |
| assert(!MaxGroupSizeForced && !MaxGroupCountForced); |
| assert((MaxGroupSize <= KernelWidth || MaxGroupSize % KernelWidth == 0) && |
| "Invalid maxGroupSize"); |
| // Maximize group size. |
| while (MaxGroupSize >= KernelWidth) { |
| uint32_t NumThreadsPerGroup = |
| (MaxGroupSize + SIMDWidth - 1) / SIMDWidth; |
| |
| if (NumThreadsPerSubslice % NumThreadsPerGroup == 0) { |
| uint32_t NumGroupsPerSubslice = |
| NumThreadsPerSubslice / NumThreadsPerGroup; |
| MaxGroupCount = NumGroupsPerSubslice * NumSubslices; |
| break; |
| } |
| MaxGroupSize -= KernelWidth; |
| } |
| } |
| } |
| |
| uint32_t GRPCounts[3] = {MaxGroupCount, 1, 1}; |
| uint32_t GRPSizes[3] = {MaxGroupSize, 1, 1}; |
| if (!MaxGroupCountForced) { |
| GRPCounts[0] *= OptSubscRate; |
| } |
| GroupCounts.groupCountX = GRPCounts[0]; |
| GroupCounts.groupCountY = GRPCounts[1]; |
| GroupCounts.groupCountZ = GRPCounts[2]; |
| std::copy(GRPSizes, GRPSizes + 3, GroupSizes); |
| } |
| |
| Error L0KernelTy::getGroupsShape(L0DeviceTy &Device, int32_t NumTeams, |
| int32_t ThreadLimit, uint32_t *GroupSizes, |
| L0LaunchEnvTy &KEnv) const { |
| |
| const auto DeviceId = Device.getDeviceId(); |
| const auto &KernelPR = getProperties(); |
| |
| // Read the most recent global thread limit and max teams. |
| const int32_t NumTeamsICV = 0; |
| const int32_t ThreadLimitICV = 0; |
| |
| bool IsXeHPG = Device.isDeviceArch(DeviceArchTy::DeviceArch_XeHPG); |
| KEnv.HalfNumThreads = |
| LevelZeroPluginTy::getOptions().ZeDebugEnabled && IsXeHPG; |
| uint32_t KernelWidth = KernelPR.Width; |
| uint32_t SIMDWidth = KernelPR.SIMDWidth; |
| INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, |
| "Assumed kernel SIMD width is %" PRIu32 "\n", SIMDWidth); |
| INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, |
| "Preferred team size is multiple of %" PRIu32 "\n", KernelWidth); |
| assert(SIMDWidth <= KernelWidth && "Invalid SIMD width."); |
| |
| if (ThreadLimit > 0) { |
| // use thread_limit clause value default. |
| DP("Max team size is set to %" PRId32 " (thread_limit clause)\n", |
| ThreadLimit); |
| } else if (ThreadLimitICV > 0) { |
| // else use thread-limit-var ICV. |
| ThreadLimit = ThreadLimitICV; |
| DP("Max team size is set to %" PRId32 " (thread-limit-icv)\n", ThreadLimit); |
| } |
| |
| size_t MaxThreadLimit = Device.getMaxGroupSize(); |
| // Set correct max group size if the kernel was compiled with explicit SIMD. |
| if (SIMDWidth == 1) |
| MaxThreadLimit = Device.getNumThreadsPerSubslice(); |
| |
| if (KernelPR.MaxThreadGroupSize < MaxThreadLimit) { |
| MaxThreadLimit = KernelPR.MaxThreadGroupSize; |
| DP("Capping maximum team size to %zu due to kernel constraints.\n", |
| MaxThreadLimit); |
| } |
| |
| if (ThreadLimit > static_cast<int32_t>(MaxThreadLimit)) { |
| ThreadLimit = MaxThreadLimit; |
| DP("Max team size execceds current maximum %zu. Adjusted\n", |
| MaxThreadLimit); |
| } |
| // scope code to ease integration with downstream custom code. |
| { |
| if (NumTeams > 0) { |
| DP("Number of teams is set to %" PRId32 |
| "(num_teams clause or no teams construct)\n", |
| NumTeams); |
| } else if (NumTeamsICV > 0) { |
| // OMP_NUM_TEAMS only matters, if num_teams() clause is absent. |
| INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, |
| "OMP_NUM_TEAMS(%" PRId32 ") is ignored\n", NumTeamsICV); |
| |
| NumTeams = NumTeamsICV; |
| DP("Max number of teams is set to %" PRId32 " (OMP_NUM_TEAMS)\n", |
| NumTeams); |
| } |
| |
| decideKernelGroupArguments(Device, (uint32_t)NumTeams, |
| (uint32_t)ThreadLimit, GroupSizes, KEnv); |
| } |
| |
| return Plugin::success(); |
| } |
| |
| static Error launchKernelWithImmCmdList(L0DeviceTy &l0Device, |
| ze_kernel_handle_t zeKernel, |
| L0LaunchEnvTy &KEnv, |
| CommandModeTy CommandMode) { |
| const auto DeviceId = l0Device.getDeviceId(); |
| auto *IdStr = l0Device.getZeIdCStr(); |
| auto CmdListOrErr = l0Device.getImmCmdList(); |
| if (!CmdListOrErr) |
| return CmdListOrErr.takeError(); |
| const ze_command_list_handle_t CmdList = *CmdListOrErr; |
| // Command queue is not used with immediate command list. |
| |
| INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, |
| "Using immediate command list for kernel submission.\n"); |
| auto EventOrError = l0Device.getEvent(); |
| if (!EventOrError) |
| return EventOrError.takeError(); |
| ze_event_handle_t Event = *EventOrError; |
| size_t NumWaitEvents = 0; |
| ze_event_handle_t *WaitEvents = nullptr; |
| auto *AsyncQueue = KEnv.AsyncQueue; |
| if (KEnv.IsAsync && !AsyncQueue->WaitEvents.empty()) { |
| if (CommandMode == CommandModeTy::AsyncOrdered) { |
| NumWaitEvents = 1; |
| WaitEvents = &AsyncQueue->WaitEvents.back(); |
| } else { |
| NumWaitEvents = AsyncQueue->WaitEvents.size(); |
| WaitEvents = AsyncQueue->WaitEvents.data(); |
| } |
| } |
| INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, |
| "Kernel depends on %zu data copying events.\n", NumWaitEvents); |
| CALL_ZE_RET_ERROR(zeCommandListAppendLaunchKernel, CmdList, zeKernel, |
| &KEnv.GroupCounts, Event, NumWaitEvents, WaitEvents); |
| KEnv.KernelPR.Mtx.unlock(); |
| INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, |
| "Submitted kernel " DPxMOD " to device %s\n", DPxPTR(zeKernel), IdStr); |
| |
| if (KEnv.IsAsync) { |
| AsyncQueue->WaitEvents.push_back(Event); |
| AsyncQueue->KernelEvent = Event; |
| } else { |
| CALL_ZE_RET_ERROR(zeEventHostSynchronize, Event, L0DefaultTimeout); |
| if (auto Err = l0Device.releaseEvent(Event)) |
| return Err; |
| } |
| INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, |
| "Executed kernel entry " DPxMOD " on device %s\n", DPxPTR(zeKernel), |
| IdStr); |
| |
| return Plugin::success(); |
| } |
| |
| static Error launchKernelWithCmdQueue(L0DeviceTy &l0Device, |
| ze_kernel_handle_t zeKernel, |
| L0LaunchEnvTy &KEnv) { |
| const auto DeviceId = l0Device.getDeviceId(); |
| const auto *IdStr = l0Device.getZeIdCStr(); |
| |
| auto CmdListOrErr = l0Device.getCmdList(); |
| if (!CmdListOrErr) |
| return CmdListOrErr.takeError(); |
| ze_command_list_handle_t CmdList = *CmdListOrErr; |
| auto CmdQueueOrErr = l0Device.getCmdQueue(); |
| if (!CmdQueueOrErr) |
| return CmdQueueOrErr.takeError(); |
| const ze_command_queue_handle_t CmdQueue = *CmdQueueOrErr; |
| |
| INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, |
| "Using regular command list for kernel submission.\n"); |
| |
| ze_event_handle_t Event = nullptr; |
| CALL_ZE_RET_ERROR(zeCommandListAppendLaunchKernel, CmdList, zeKernel, |
| &KEnv.GroupCounts, Event, 0, nullptr); |
| KEnv.KernelPR.Mtx.unlock(); |
| CALL_ZE_RET_ERROR(zeCommandListClose, CmdList); |
| CALL_ZE_RET_ERROR_MTX(zeCommandQueueExecuteCommandLists, l0Device.getMutex(), |
| CmdQueue, 1, &CmdList, nullptr); |
| INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, |
| "Submitted kernel " DPxMOD " to device %s\n", DPxPTR(zeKernel), IdStr); |
| CALL_ZE_RET_ERROR(zeCommandQueueSynchronize, CmdQueue, L0DefaultTimeout); |
| CALL_ZE_RET_ERROR(zeCommandListReset, CmdList); |
| if (Event) { |
| if (auto Err = l0Device.releaseEvent(Event)) |
| return Err; |
| } |
| INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, |
| "Executed kernel entry " DPxMOD " on device %s\n", DPxPTR(zeKernel), |
| IdStr); |
| |
| return Plugin::success(); |
| } |
| |
| Error L0KernelTy::setKernelGroups(L0DeviceTy &l0Device, L0LaunchEnvTy &KEnv, |
| uint32_t NumThreads[3], |
| uint32_t NumBlocks[3]) const { |
| |
| if (KernelEnvironment.Configuration.ExecMode != OMP_TGT_EXEC_MODE_BARE) { |
| // For non-bare mode, the groups are already set in the launch. |
| KEnv.GroupCounts = {NumBlocks[0], NumBlocks[1], NumBlocks[2]}; |
| CALL_ZE_RET_ERROR(zeKernelSetGroupSize, getZeKernel(), NumThreads[0], |
| NumThreads[1], NumThreads[2]); |
| return Plugin::success(); |
| } |
| |
| int32_t NumTeams = NumBlocks[0]; |
| int32_t ThreadLimit = NumThreads[0]; |
| if (NumTeams < 0) |
| NumTeams = 0; |
| if (ThreadLimit < 0) |
| ThreadLimit = 0; |
| |
| uint32_t GroupSizes[3]; |
| auto DeviceId = l0Device.getDeviceId(); |
| auto &KernelPR = KEnv.KernelPR; |
| // Check if we can reuse previous group parameters. |
| bool GroupParamsReused = |
| KernelPR.reuseGroupParams(NumTeams, ThreadLimit, GroupSizes, KEnv); |
| |
| if (!GroupParamsReused) { |
| if (auto Err = |
| getGroupsShape(l0Device, NumTeams, ThreadLimit, GroupSizes, KEnv)) |
| return Err; |
| KernelPR.cacheGroupParams(NumTeams, ThreadLimit, GroupSizes, KEnv); |
| } |
| |
| INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, |
| "Team sizes = {%" PRIu32 ", %" PRIu32 ", %" PRIu32 "}\n", GroupSizes[0], |
| GroupSizes[1], GroupSizes[2]); |
| INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, |
| "Number of teams = {%" PRIu32 ", %" PRIu32 ", %" PRIu32 "}\n", |
| KEnv.GroupCounts.groupCountX, KEnv.GroupCounts.groupCountY, |
| KEnv.GroupCounts.groupCountZ); |
| |
| if (!GroupParamsReused) { |
| CALL_ZE_RET_ERROR(zeKernelSetGroupSize, getZeKernel(), GroupSizes[0], |
| GroupSizes[1], GroupSizes[2]); |
| } |
| |
| return Plugin::success(); |
| } |
| |
| Error L0KernelTy::setIndirectFlags(L0DeviceTy &l0Device, |
| L0LaunchEnvTy &KEnv) const { |
| // Set Kernel Indirect flags. |
| ze_kernel_indirect_access_flags_t Flags = 0; |
| Flags |= l0Device.getMemAllocator(TARGET_ALLOC_HOST).getIndirectFlags(); |
| Flags |= l0Device.getMemAllocator(TARGET_ALLOC_DEVICE).getIndirectFlags(); |
| |
| if (KEnv.KernelPR.IndirectAccessFlags != Flags) { |
| // Combine with common access flags. |
| const auto FinalFlags = l0Device.getIndirectFlags() | Flags; |
| CALL_ZE_RET_ERROR(zeKernelSetIndirectAccess, zeKernel, FinalFlags); |
| DP("Setting indirect access flags " DPxMOD "\n", DPxPTR(FinalFlags)); |
| KEnv.KernelPR.IndirectAccessFlags = Flags; |
| } |
| |
| return Plugin::success(); |
| } |
| |
| Error L0KernelTy::launchImpl(GenericDeviceTy &GenericDevice, |
| uint32_t NumThreads[3], uint32_t NumBlocks[3], |
| KernelArgsTy &KernelArgs, |
| KernelLaunchParamsTy LaunchParams, |
| AsyncInfoWrapperTy &AsyncInfoWrapper) const { |
| auto &l0Device = L0DeviceTy::makeL0Device(GenericDevice); |
| __tgt_async_info *AsyncInfo = AsyncInfoWrapper; |
| |
| auto zeKernel = getZeKernel(); |
| auto DeviceId = l0Device.getDeviceId(); |
| int32_t NumArgs = KernelArgs.NumArgs; |
| INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, "Launching kernel " DPxMOD "...\n", |
| DPxPTR(zeKernel)); |
| |
| auto &Plugin = l0Device.getPlugin(); |
| auto *IdStr = l0Device.getZeIdCStr(); |
| auto &Options = LevelZeroPluginTy::getOptions(); |
| bool IsAsync = AsyncInfo && l0Device.asyncEnabled(); |
| if (IsAsync && !AsyncInfo->Queue) { |
| AsyncInfo->Queue = reinterpret_cast<void *>(Plugin.getAsyncQueue()); |
| if (!AsyncInfo->Queue) |
| IsAsync = false; // Couldn't get a queue, revert to sync. |
| } |
| auto *AsyncQueue = |
| IsAsync ? static_cast<AsyncQueueTy *>(AsyncInfo->Queue) : nullptr; |
| auto &KernelPR = getProperties(); |
| |
| L0LaunchEnvTy KEnv(IsAsync, AsyncQueue, KernelPR); |
| |
| // Protect from kernel preparation to submission as kernels are shared. |
| KernelPR.Mtx.lock(); |
| |
| if (auto Err = setKernelGroups(l0Device, KEnv, NumThreads, NumBlocks)) |
| return Err; |
| |
| // Set kernel arguments. |
| for (int32_t I = 0; I < NumArgs; I++) { |
| // Scope code to ease integration with downstream custom code. |
| { |
| void *Arg = (static_cast<void **>(LaunchParams.Data))[I]; |
| CALL_ZE_RET_ERROR(zeKernelSetArgumentValue, zeKernel, I, sizeof(Arg), |
| Arg == nullptr ? nullptr : &Arg); |
| INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, |
| "Kernel Pointer argument %" PRId32 " (value: " DPxMOD |
| ") was set successfully for device %s.\n", |
| I, DPxPTR(Arg), IdStr); |
| } |
| } |
| |
| if (auto Err = setIndirectFlags(l0Device, KEnv)) |
| return Err; |
| |
| // The next calls should unlock the KernelLock internally. |
| const bool UseImmCmdList = l0Device.useImmForCompute(); |
| if (UseImmCmdList) |
| return launchKernelWithImmCmdList(l0Device, zeKernel, KEnv, |
| Options.CommandMode); |
| |
| return launchKernelWithCmdQueue(l0Device, zeKernel, KEnv); |
| } |
| |
| } // namespace llvm::omp::target::plugin |