offload/plugins-nextgen/level_zero/src/L0Kernel.cpp - llvm-project.git - Git at Google

 //===--- Level Zero Target RTL Implementation -----------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 // GenericKernel implementation for SPIR-V/Xe machine.
 //
 //===----------------------------------------------------------------------===//

 #include "L0Kernel.h"
 #include "L0Device.h"
 #include "L0Plugin.h"
 #include "L0Program.h"

 #include <mutex>

 namespace llvm::omp::target::plugin {

 /// Attempts to serve a launch request from the cached group configuration.
 ///
 /// \param NumTeamsIn requested number of teams, compared with the cached one.
 /// \param ThreadLimitIn requested thread limit, compared with the cached one.
 /// \param GroupSizesOut receives the three cached group sizes on a hit and is
 ///        left untouched on a miss.
 /// \param KEnv launch environment whose GroupCounts is overwritten on a hit.
 /// \returns true when both inputs match the cached values, false otherwise.
 bool KernelPropertiesTy::reuseGroupParams(const int32_t NumTeamsIn,
                                           const int32_t ThreadLimitIn,
                                           uint32_t *GroupSizesOut,
                                           L0LaunchEnvTy &KEnv) const {
   const bool IsCacheHit =
       NumTeamsIn == NumTeams && ThreadLimitIn == ThreadLimit;
   if (IsCacheHit) {
     // Hand back the configuration recorded for this exact request.
     KEnv.GroupCounts = GroupCounts;
     std::copy_n(GroupSizes, 3, GroupSizesOut);
   }
   return IsCacheHit;
 }

 /// Records the group configuration computed for a launch request so that a
 /// later request with the same inputs can reuse it.
 ///
 /// \param NumTeamsIn number of teams the configuration was computed for.
 /// \param ThreadLimitIn thread limit the configuration was computed for.
 /// \param GroupSizesIn the three group sizes to remember.
 /// \param KEnv launch environment whose GroupCounts is remembered.
 void KernelPropertiesTy::cacheGroupParams(const int32_t NumTeamsIn,
                                           const int32_t ThreadLimitIn,
                                           const uint32_t *GroupSizesIn,
                                           L0LaunchEnvTy &KEnv) {
   // Store the computed configuration ...
   GroupCounts = KEnv.GroupCounts;
   std::copy_n(GroupSizesIn, 3, GroupSizes);
   // ... together with the request values it belongs to.
   ThreadLimit = ThreadLimitIn;
   NumTeams = NumTeamsIn;
 }

 /// Queries the Level Zero kernel properties and stores the derived SIMD
 /// width, preferred group-size multiple ("Width") and maximum thread-group
 /// size in this kernel's properties.
 ///
 /// \param Program program the kernel belongs to; only its device is used.
 /// \returns success, or the error produced by the failed Level Zero call.
 Error L0KernelTy::readKernelProperties(L0ProgramTy &Program) {
   const auto &l0Device = L0DeviceTy::makeL0Device(Program.getDevice());
   auto &KernelPR = getProperties();

   // Optional extension structure; it is chained only when the driver
   // reports API version 1.2 or newer.
   ze_kernel_preferred_group_size_properties_t PreferredSize = {};
   PreferredSize.stype =
       ZE_STRUCTURE_TYPE_KERNEL_PREFERRED_GROUP_SIZE_PROPERTIES;
   PreferredSize.pNext = nullptr;

   ze_kernel_properties_t KP = {};
   KP.stype = ZE_STRUCTURE_TYPE_KERNEL_PROPERTIES;
   KP.pNext = nullptr;
   if (l0Device.getDriverAPIVersion() >= ZE_API_VERSION_1_2)
     KP.pNext = &PreferredSize;

   CALL_ZE_RET_ERROR(zeKernelGetProperties, zeKernel, &KP);

   KernelPR.SIMDWidth = KP.maxSubgroupSize;
   // Use the preferred multiple when it was queried, otherwise fall back to
   // the maximum sub-group size.
   if (KP.pNext)
     KernelPR.Width = PreferredSize.preferredMultiple;
   else
     KernelPR.Width = KP.maxSubgroupSize;

   // On every architecture other than Gen the width is at least twice the
   // SIMD width.
   if (!l0Device.isDeviceArch(DeviceArchTy::DeviceArch_Gen))
     KernelPR.Width = (std::max)(KernelPR.Width, 2 * KernelPR.SIMDWidth);

   KernelPR.MaxThreadGroupSize = KP.maxSubgroupSize * KP.maxNumSubgroups;
   return Plugin::success();
 }

 /// Creates the Level Zero kernel object for this kernel's name in the module
 /// of \p Program that contains it, then reads the kernel's properties.
 ///
 /// \returns success, or the error from kernel creation or property query.
 Error L0KernelTy::buildKernel(L0ProgramTy &Program) {
   const auto *KernelName = getName();
   auto Module = Program.findModuleFromKernelName(KernelName);

   ze_kernel_desc_t KernelDesc = {};
   KernelDesc.stype = ZE_STRUCTURE_TYPE_KERNEL_DESC;
   KernelDesc.pNext = nullptr;
   KernelDesc.flags = 0;
   KernelDesc.pKernelName = KernelName;
   CALL_ZE_RET_ERROR(zeKernelCreate, Module, &KernelDesc, &zeKernel);

   // The property query yields the final status of the build.
   return readKernelProperties(Program);
 }

 /// Initializes the kernel from \p Image: builds the Level Zero kernel and, on
 /// success, registers this kernel with the owning program.
 ///
 /// \param GenericDevice not used by this implementation.
 /// \param Image device image, converted to the Level Zero program.
 /// \returns success, or the error reported while building the kernel.
 Error L0KernelTy::initImpl(GenericDeviceTy &GenericDevice,
                            DeviceImageTy &Image) {
   auto &Program = L0ProgramTy::makeL0Program(Image);

   Error BuildErr = buildKernel(Program);
   if (BuildErr)
     return BuildErr;

   // Register only kernels that were built successfully.
   Program.addKernel(this);
   return Plugin::success();
 }

 /// Chooses the group size and group count for a launch.
 ///
 /// A positive \p ThreadLimit forces the group size and a positive
 /// \p NumTeams forces the group count. Values that are not forced are
 /// derived from the device's sub-slice/thread counts and the kernel's SIMD
 /// width and preferred width. When the count is not forced it is multiplied
 /// by the plugin's subscription rate.
 ///
 /// \param Device device queried for its limits and topology.
 /// \param NumTeams requested number of teams; 0 means "not specified".
 /// \param ThreadLimit requested thread limit; 0 means "not specified".
 /// \param GroupSizes receives {group size, 1, 1}.
 /// \param KEnv launch environment; its GroupCounts receives {count, 1, 1}
 ///        and its HalfNumThreads flag halves the per-sub-slice thread count.
 void L0KernelTy::decideKernelGroupArguments(L0DeviceTy &Device,
                                             uint32_t NumTeams,
                                             uint32_t ThreadLimit,
                                             uint32_t *GroupSizes,
                                             L0LaunchEnvTy &KEnv) const {
   const KernelPropertiesTy &KernelPR = getProperties();
   const auto DeviceId = Device.getDeviceId();

   const uint32_t SIMDWidth = KernelPR.SIMDWidth;
   const uint32_t KernelWidth = KernelPR.Width;

   // Number of hardware threads one group of the given size occupies,
   // rounded up to whole SIMD lanes.
   const auto ThreadsForGroup = [SIMDWidth](uint32_t GroupSize) {
     return (GroupSize + SIMDWidth - 1) / SIMDWidth;
   };

   const bool MaxGroupSizeForced = ThreadLimit > 0;
   const bool MaxGroupCountForced = NumTeams > 0;

   uint32_t MaxGroupSize = Device.getMaxGroupSize();
   const auto SubscriptionRate =
       LevelZeroPluginTy::getOptions().SubscriptionRate;

   // The kernel itself may support fewer work-items per group than the device.
   if (KernelPR.MaxThreadGroupSize < MaxGroupSize) {
     MaxGroupSize = KernelPR.MaxThreadGroupSize;
     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
          "Capping maximum team size to %" PRIu32
          " due to kernel constraints.\n",
          MaxGroupSize);
   }
   if (MaxGroupSizeForced)
     MaxGroupSize = ThreadLimit;

   uint32_t MaxGroupCount = MaxGroupCountForced ? NumTeams : 0;

   if (MaxGroupCountForced) {
     // If number of teams is specified by the user, then use KernelWidth
     // WIs per WG by default, so that it matches
     // decideLoopKernelGroupArguments() behavior.
     if (!MaxGroupSizeForced)
       MaxGroupSize = KernelWidth;
   } else {
     const uint32_t SubsliceCount = Device.getNumSubslices();
     uint32_t ThreadsPerSubslice = Device.getNumThreadsPerSubslice();
     if (KEnv.HalfNumThreads)
       ThreadsPerSubslice /= 2;

     // Default: one group per hardware thread.
     MaxGroupCount = SubsliceCount * ThreadsPerSubslice;

     if (MaxGroupSizeForced) {
       // Set group count for the HW capacity given the forced group size.
       const uint32_t ThreadsPerGroup = ThreadsForGroup(MaxGroupSize);
       const uint32_t GroupsPerSubslice =
           (ThreadsPerSubslice + ThreadsPerGroup - 1) / ThreadsPerGroup;
       MaxGroupCount = GroupsPerSubslice * SubsliceCount;
     } else {
       assert(!MaxGroupSizeForced && !MaxGroupCountForced);
       assert((MaxGroupSize <= KernelWidth || MaxGroupSize % KernelWidth == 0) &&
              "Invalid maxGroupSize");
       // Maximize group size: step down by KernelWidth until the groups
       // evenly divide the threads of a sub-slice.
       for (; MaxGroupSize >= KernelWidth; MaxGroupSize -= KernelWidth) {
         const uint32_t ThreadsPerGroup = ThreadsForGroup(MaxGroupSize);
         if (ThreadsPerSubslice % ThreadsPerGroup != 0)
           continue;
         MaxGroupCount = (ThreadsPerSubslice / ThreadsPerGroup) * SubsliceCount;
         break;
       }
     }
   }

   // Oversubscribe only when the user did not pin the number of teams.
   if (!MaxGroupCountForced)
     MaxGroupCount *= SubscriptionRate;

   auto &GroupCounts = KEnv.GroupCounts;
   GroupCounts.groupCountX = MaxGroupCount;
   GroupCounts.groupCountY = 1;
   GroupCounts.groupCountZ = 1;

   GroupSizes[0] = MaxGroupSize;
   GroupSizes[1] = 1;
   GroupSizes[2] = 1;
 }

 /// Computes the group sizes and group counts for a launch.
 ///
 /// Clamps \p ThreadLimit to the smaller of the device limit and the kernel's
 /// maximum thread-group size, then delegates to
 /// decideKernelGroupArguments().
 ///
 /// \param Device device the kernel will run on.
 /// \param NumTeams requested number of teams; values <= 0 mean "unspecified".
 /// \param ThreadLimit requested thread limit; values <= 0 mean "unspecified".
 /// \param GroupSizes receives the three group sizes.
 /// \param KEnv launch environment; HalfNumThreads and GroupCounts are set.
 /// \returns always success in the visible code.
 Error L0KernelTy::getGroupsShape(L0DeviceTy &Device, int32_t NumTeams,
                                  int32_t ThreadLimit, uint32_t *GroupSizes,
                                  L0LaunchEnvTy &KEnv) const {

   const auto DeviceId = Device.getDeviceId();
   const auto &KernelPR = getProperties();

   // Stand-ins for the global thread-limit and max-teams ICVs. Both are fixed
   // at 0 here, so the ICV branches below are currently never taken.
   const int32_t NumTeamsICV = 0;
   const int32_t ThreadLimitICV = 0;

   const bool IsXeHPG = Device.isDeviceArch(DeviceArchTy::DeviceArch_XeHPG);
   KEnv.HalfNumThreads =
       LevelZeroPluginTy::getOptions().ZeDebugEnabled && IsXeHPG;
   const uint32_t KernelWidth = KernelPR.Width;
   const uint32_t SIMDWidth = KernelPR.SIMDWidth;
   INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
        "Assumed kernel SIMD width is %" PRIu32 "\n", SIMDWidth);
   INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
        "Preferred team size is multiple of %" PRIu32 "\n", KernelWidth);
   assert(SIMDWidth <= KernelWidth && "Invalid SIMD width.");

   if (ThreadLimit > 0) {
     // Use the thread_limit clause value.
     DP("Max team size is set to %" PRId32 " (thread_limit clause)\n",
        ThreadLimit);
   } else if (ThreadLimitICV > 0) {
     // Otherwise use the thread-limit-var ICV.
     ThreadLimit = ThreadLimitICV;
     DP("Max team size is set to %" PRId32 " (thread-limit-icv)\n", ThreadLimit);
   }

   size_t MaxThreadLimit = Device.getMaxGroupSize();
   // Set correct max group size if the kernel was compiled with explicit SIMD.
   if (SIMDWidth == 1)
     MaxThreadLimit = Device.getNumThreadsPerSubslice();

   if (KernelPR.MaxThreadGroupSize < MaxThreadLimit) {
     MaxThreadLimit = KernelPR.MaxThreadGroupSize;
     DP("Capping maximum team size to %zu due to kernel constraints.\n",
        MaxThreadLimit);
   }

   // Compare in size_t so a large limit is never narrowed before the
   // comparison. Inside the branch MaxThreadLimit < ThreadLimit, so the
   // narrowing assignment cannot overflow.
   if (ThreadLimit > 0 && static_cast<size_t>(ThreadLimit) > MaxThreadLimit) {
     ThreadLimit = static_cast<int32_t>(MaxThreadLimit);
     DP("Max team size exceeds current maximum %zu. Adjusted\n",
        MaxThreadLimit);
   }
   // scope code to ease integration with downstream custom code.
   {
     if (NumTeams > 0) {
       DP("Number of teams is set to %" PRId32
          "(num_teams clause or no teams construct)\n",
          NumTeams);
     } else if (NumTeamsICV > 0) {
       // OMP_NUM_TEAMS only matters, if num_teams() clause is absent.
       // NOTE(review): the message says the value is ignored, yet it is
       // applied right below - confirm which one is intended.
       INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
            "OMP_NUM_TEAMS(%" PRId32 ") is ignored\n", NumTeamsICV);

       NumTeams = NumTeamsICV;
       DP("Max number of teams is set to %" PRId32 " (OMP_NUM_TEAMS)\n",
          NumTeams);
     }

     decideKernelGroupArguments(Device, static_cast<uint32_t>(NumTeams),
                                static_cast<uint32_t>(ThreadLimit), GroupSizes,
                                KEnv);
   }

   return Plugin::success();
 }

 /// Submits \p zeKernel through the device's immediate command list.
 ///
 /// The caller (launchImpl) enters with KEnv.KernelPR.Mtx locked; this
 /// function releases it right after the launch is appended, or on any
 /// earlier error return. In async mode the signal event is recorded on the
 /// async queue; otherwise the host waits for the event and releases it.
 ///
 /// \param CommandMode in AsyncOrdered mode the kernel waits only on the most
 ///        recent queued event, otherwise on all queued events.
 /// \returns success, or the first error encountered.
 static Error launchKernelWithImmCmdList(L0DeviceTy &l0Device,
                                         ze_kernel_handle_t zeKernel,
                                         L0LaunchEnvTy &KEnv,
                                         CommandModeTy CommandMode) {
   // Adopt the already-held kernel mutex so that every early return below
   // unlocks it instead of leaving it locked for the next launch.
   std::unique_lock KernelLock(KEnv.KernelPR.Mtx, std::adopt_lock);

   const auto DeviceId = l0Device.getDeviceId();
   auto *IdStr = l0Device.getZeIdCStr();
   auto CmdListOrErr = l0Device.getImmCmdList();
   if (!CmdListOrErr)
     return CmdListOrErr.takeError();
   const ze_command_list_handle_t CmdList = *CmdListOrErr;
   // Command queue is not used with immediate command list.

   INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
        "Using immediate command list for kernel submission.\n");
   auto EventOrError = l0Device.getEvent();
   if (!EventOrError)
     return EventOrError.takeError();
   ze_event_handle_t Event = *EventOrError;
   size_t NumWaitEvents = 0;
   ze_event_handle_t *WaitEvents = nullptr;
   auto *AsyncQueue = KEnv.AsyncQueue;
   if (KEnv.IsAsync && !AsyncQueue->WaitEvents.empty()) {
     if (CommandMode == CommandModeTy::AsyncOrdered) {
       // Ordered mode: depending on the last event is sufficient.
       NumWaitEvents = 1;
       WaitEvents = &AsyncQueue->WaitEvents.back();
     } else {
       NumWaitEvents = AsyncQueue->WaitEvents.size();
       WaitEvents = AsyncQueue->WaitEvents.data();
     }
   }
   INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
        "Kernel depends on %zu data copying events.\n", NumWaitEvents);
   CALL_ZE_RET_ERROR(zeCommandListAppendLaunchKernel, CmdList, zeKernel,
                     &KEnv.GroupCounts, Event, NumWaitEvents, WaitEvents);
   // The kernel state has been consumed by the append; other launches of the
   // same kernel may proceed.
   KernelLock.unlock();
   INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
        "Submitted kernel " DPxMOD " to device %s\n", DPxPTR(zeKernel), IdStr);

   if (KEnv.IsAsync) {
     AsyncQueue->WaitEvents.push_back(Event);
     AsyncQueue->KernelEvent = Event;
   } else {
     CALL_ZE_RET_ERROR(zeEventHostSynchronize, Event, L0DefaultTimeout);
     if (auto Err = l0Device.releaseEvent(Event))
       return Err;
   }
   INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
        "Executed kernel entry " DPxMOD " on device %s\n", DPxPTR(zeKernel),
        IdStr);

   return Plugin::success();
 }

 /// Submits \p zeKernel through a regular command list and command queue and
 /// waits for the queue to finish.
 ///
 /// The caller (launchImpl) enters with KEnv.KernelPR.Mtx locked; this
 /// function releases it right after the launch is appended, or on any
 /// earlier error return.
 ///
 /// \returns success, or the first error encountered.
 static Error launchKernelWithCmdQueue(L0DeviceTy &l0Device,
                                       ze_kernel_handle_t zeKernel,
                                       L0LaunchEnvTy &KEnv) {
   // Adopt the already-held kernel mutex so that every early return below
   // unlocks it instead of leaving it locked for the next launch.
   std::unique_lock KernelLock(KEnv.KernelPR.Mtx, std::adopt_lock);

   const auto DeviceId = l0Device.getDeviceId();
   const auto *IdStr = l0Device.getZeIdCStr();

   auto CmdListOrErr = l0Device.getCmdList();
   if (!CmdListOrErr)
     return CmdListOrErr.takeError();
   ze_command_list_handle_t CmdList = *CmdListOrErr;
   auto CmdQueueOrErr = l0Device.getCmdQueue();
   if (!CmdQueueOrErr)
     return CmdQueueOrErr.takeError();
   const ze_command_queue_handle_t CmdQueue = *CmdQueueOrErr;

   INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
        "Using regular command list for kernel submission.\n");

   // No signal event is used on this path; completion is observed through
   // zeCommandQueueSynchronize below.
   ze_event_handle_t Event = nullptr;
   CALL_ZE_RET_ERROR(zeCommandListAppendLaunchKernel, CmdList, zeKernel,
                     &KEnv.GroupCounts, Event, 0, nullptr);
   // The kernel state has been consumed by the append; other launches of the
   // same kernel may proceed.
   KernelLock.unlock();
   CALL_ZE_RET_ERROR(zeCommandListClose, CmdList);
   CALL_ZE_RET_ERROR_MTX(zeCommandQueueExecuteCommandLists, l0Device.getMutex(),
                         CmdQueue, 1, &CmdList, nullptr);
   INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
        "Submitted kernel " DPxMOD " to device %s\n", DPxPTR(zeKernel), IdStr);
   CALL_ZE_RET_ERROR(zeCommandQueueSynchronize, CmdQueue, L0DefaultTimeout);
   CALL_ZE_RET_ERROR(zeCommandListReset, CmdList);
   if (Event) {
     if (auto Err = l0Device.releaseEvent(Event))
       return Err;
   }
   INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
        "Executed kernel entry " DPxMOD " on device %s\n", DPxPTR(zeKernel),
        IdStr);

   return Plugin::success();
 }

 /// Sets the group size on the Level Zero kernel and fills KEnv.GroupCounts.
 ///
 /// When the kernel's execution mode is not OMP_TGT_EXEC_MODE_BARE the values
 /// in \p NumThreads / \p NumBlocks are applied as given. Otherwise only the
 /// first dimension is taken as a thread-limit / num-teams hint and the shape
 /// is computed by getGroupsShape(), or reused from the cache when the same
 /// hints were used for the previous successful configuration.
 ///
 /// \returns success, or the error from the shape computation or from
 ///          zeKernelSetGroupSize.
 Error L0KernelTy::setKernelGroups(L0DeviceTy &l0Device, L0LaunchEnvTy &KEnv,
                                   uint32_t NumThreads[3],
                                   uint32_t NumBlocks[3]) const {

   if (KernelEnvironment.Configuration.ExecMode != OMP_TGT_EXEC_MODE_BARE) {
     // For non-bare mode, the groups are already set in the launch.
     KEnv.GroupCounts = {NumBlocks[0], NumBlocks[1], NumBlocks[2]};
     CALL_ZE_RET_ERROR(zeKernelSetGroupSize, getZeKernel(), NumThreads[0],
                       NumThreads[1], NumThreads[2]);
     return Plugin::success();
   }

   // Values that do not fit a non-negative int32_t are treated as
   // "unspecified" (0).
   int32_t NumTeams = NumBlocks[0];
   int32_t ThreadLimit = NumThreads[0];
   if (NumTeams < 0)
     NumTeams = 0;
   if (ThreadLimit < 0)
     ThreadLimit = 0;

   uint32_t GroupSizes[3];
   auto DeviceId = l0Device.getDeviceId();
   auto &KernelPR = KEnv.KernelPR;
   // Check if we can reuse previous group parameters.
   const bool GroupParamsReused =
       KernelPR.reuseGroupParams(NumTeams, ThreadLimit, GroupSizes, KEnv);

   if (!GroupParamsReused) {
     if (auto Err =
             getGroupsShape(l0Device, NumTeams, ThreadLimit, GroupSizes, KEnv))
       return Err;
   }

   INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
        "Team sizes = {%" PRIu32 ", %" PRIu32 ", %" PRIu32 "}\n", GroupSizes[0],
        GroupSizes[1], GroupSizes[2]);
   INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
        "Number of teams = {%" PRIu32 ", %" PRIu32 ", %" PRIu32 "}\n",
        KEnv.GroupCounts.groupCountX, KEnv.GroupCounts.groupCountY,
        KEnv.GroupCounts.groupCountZ);

   if (!GroupParamsReused) {
     CALL_ZE_RET_ERROR(zeKernelSetGroupSize, getZeKernel(), GroupSizes[0],
                       GroupSizes[1], GroupSizes[2]);
     // Cache only after the kernel accepted the group size. Caching earlier
     // would let a later launch with the same hints skip
     // zeKernelSetGroupSize even though this call failed.
     KernelPR.cacheGroupParams(NumTeams, ThreadLimit, GroupSizes, KEnv);
   }

   return Plugin::success();
 }

 /// Updates the kernel's indirect-access flags when the flags reported by the
 /// host and device memory allocators differ from the cached value.
 ///
 /// \returns success, or the error from zeKernelSetIndirectAccess.
 Error L0KernelTy::setIndirectFlags(L0DeviceTy &l0Device,
                                    L0LaunchEnvTy &KEnv) const {
   // Gather the flags of both allocators, host first.
   ze_kernel_indirect_access_flags_t Flags = 0;
   for (const auto Kind : {TARGET_ALLOC_HOST, TARGET_ALLOC_DEVICE})
     Flags |= l0Device.getMemAllocator(Kind).getIndirectFlags();

   auto &CachedFlags = KEnv.KernelPR.IndirectAccessFlags;
   if (CachedFlags == Flags)
     return Plugin::success(); // Nothing changed since the last update.

   // Combine with common access flags.
   const auto FinalFlags = l0Device.getIndirectFlags() | Flags;
   CALL_ZE_RET_ERROR(zeKernelSetIndirectAccess, zeKernel, FinalFlags);
   DP("Setting indirect access flags " DPxMOD "\n", DPxPTR(FinalFlags));
   // The cache holds the allocator flags only, without the common ones.
   CachedFlags = Flags;

   return Plugin::success();
 }

 /// Launches this kernel on \p GenericDevice.
 ///
 /// Resolves the async queue (falling back to synchronous execution when none
 /// can be obtained). Then, while holding the kernel-properties mutex, it
 /// configures the group shape, the kernel arguments and the indirect-access
 /// flags, and finally submits through an immediate command list or a regular
 /// command list/queue.
 ///
 /// \param NumThreads, NumBlocks launch shape or hints, see setKernelGroups().
 /// \param KernelArgs only NumArgs is read here.
 /// \param LaunchParams LaunchParams.Data is read as an array of NumArgs
 ///        pointer-sized argument values.
 /// \returns success, or the first error reported while preparing or
 ///          submitting the kernel.
 Error L0KernelTy::launchImpl(GenericDeviceTy &GenericDevice,
                              uint32_t NumThreads[3], uint32_t NumBlocks[3],
                              KernelArgsTy &KernelArgs,
                              KernelLaunchParamsTy LaunchParams,
                              AsyncInfoWrapperTy &AsyncInfoWrapper) const {
   auto &l0Device = L0DeviceTy::makeL0Device(GenericDevice);
   __tgt_async_info *AsyncInfo = AsyncInfoWrapper;

   auto zeKernel = getZeKernel();
   auto DeviceId = l0Device.getDeviceId();
   int32_t NumArgs = KernelArgs.NumArgs;
   INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, "Launching kernel " DPxMOD "...\n",
        DPxPTR(zeKernel));

   auto &Plugin = l0Device.getPlugin();
   auto *IdStr = l0Device.getZeIdCStr();
   auto &Options = LevelZeroPluginTy::getOptions();
   bool IsAsync = AsyncInfo && l0Device.asyncEnabled();
   if (IsAsync && !AsyncInfo->Queue) {
     AsyncInfo->Queue = reinterpret_cast<void *>(Plugin.getAsyncQueue());
     if (!AsyncInfo->Queue)
       IsAsync = false; // Couldn't get a queue, revert to sync.
   }
   auto *AsyncQueue =
       IsAsync ? static_cast<AsyncQueueTy *>(AsyncInfo->Queue) : nullptr;
   auto &KernelPR = getProperties();

   L0LaunchEnvTy KEnv(IsAsync, AsyncQueue, KernelPR);

   // Protect from kernel preparation to submission as kernels are shared.
   // The guard unlocks the mutex on every early error return below; a bare
   // lock() would leave it held and block the next launch of this kernel.
   std::unique_lock KernelLock(KernelPR.Mtx);

   if (auto Err = setKernelGroups(l0Device, KEnv, NumThreads, NumBlocks))
     return Err;

   // Set kernel arguments.
   for (int32_t I = 0; I < NumArgs; I++) {
     // Scope code to ease integration with downstream custom code.
     {
       void *Arg = (static_cast<void **>(LaunchParams.Data))[I];
       // A null argument value is passed as a null argument pointer.
       CALL_ZE_RET_ERROR(zeKernelSetArgumentValue, zeKernel, I, sizeof(Arg),
                         Arg == nullptr ? nullptr : &Arg);
       INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
            "Kernel Pointer argument %" PRId32 " (value: " DPxMOD
            ") was set successfully for device %s.\n",
            I, DPxPTR(Arg), IdStr);
     }
   }

   if (auto Err = setIndirectFlags(l0Device, KEnv))
     return Err;

   // The next calls unlock the kernel mutex internally, so give up ownership
   // here without unlocking.
   (void)KernelLock.release();
   const bool UseImmCmdList = l0Device.useImmForCompute();
   if (UseImmCmdList)
     return launchKernelWithImmCmdList(l0Device, zeKernel, KEnv,
                                       Options.CommandMode);

   return launchKernelWithCmdQueue(l0Device, zeKernel, KEnv);
 }

 } // namespace llvm::omp::target::plugin
	//===--- Level Zero Target RTL Implementation -----------------------------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// GenericKernel implementation for SPIR-V/Xe machine.
	//
	//===----------------------------------------------------------------------===//

	#include "L0Kernel.h"
	#include "L0Device.h"
	#include "L0Plugin.h"
	#include "L0Program.h"

	namespace llvm::omp::target::plugin {

	/// Attempts to serve a launch request from the cached group configuration.
	///
	/// \param NumTeamsIn requested number of teams, compared with the cached one.
	/// \param ThreadLimitIn requested thread limit, compared with the cached one.
	/// \param GroupSizesOut receives the three cached group sizes on a hit and is
	///        left untouched on a miss.
	/// \param KEnv launch environment whose GroupCounts is overwritten on a hit.
	/// \returns true when both inputs match the cached values, false otherwise.
	bool KernelPropertiesTy::reuseGroupParams(const int32_t NumTeamsIn,
	                                          const int32_t ThreadLimitIn,
	                                          uint32_t *GroupSizesOut,
	                                          L0LaunchEnvTy &KEnv) const {
	  if (NumTeamsIn != NumTeams || ThreadLimitIn != ThreadLimit)
	    return false;
	  // Found matching input parameters.
	  std::copy_n(GroupSizes, 3, GroupSizesOut);
	  KEnv.GroupCounts = GroupCounts;
	  return true;
	}

	/// Records the group configuration computed for a launch request so that a
	/// later request with the same inputs can reuse it.
	///
	/// \param NumTeamsIn number of teams the configuration was computed for.
	/// \param ThreadLimitIn thread limit the configuration was computed for.
	/// \param GroupSizesIn the three group sizes to remember.
	/// \param KEnv launch environment whose GroupCounts is remembered.
	void KernelPropertiesTy::cacheGroupParams(const int32_t NumTeamsIn,
	                                          const int32_t ThreadLimitIn,
	                                          const uint32_t *GroupSizesIn,
	                                          L0LaunchEnvTy &KEnv) {
	  // Store the computed configuration ...
	  GroupCounts = KEnv.GroupCounts;
	  std::copy_n(GroupSizesIn, 3, GroupSizes);
	  // ... together with the request values it belongs to.
	  ThreadLimit = ThreadLimitIn;
	  NumTeams = NumTeamsIn;
	}

	/// Queries the Level Zero kernel properties and stores the derived SIMD
	/// width, preferred group-size multiple ("Width") and maximum thread-group
	/// size in this kernel's properties.
	///
	/// \param Program program the kernel belongs to; only its device is used.
	/// \returns success, or the error produced by the failed Level Zero call.
	Error L0KernelTy::readKernelProperties(L0ProgramTy &Program) {
	  const auto &l0Device = L0DeviceTy::makeL0Device(Program.getDevice());
	  auto &KernelPR = getProperties();

	  // Optional extension structure; it is chained only when the driver
	  // reports API version 1.2 or newer.
	  ze_kernel_preferred_group_size_properties_t PreferredSize = {};
	  PreferredSize.stype =
	      ZE_STRUCTURE_TYPE_KERNEL_PREFERRED_GROUP_SIZE_PROPERTIES;
	  PreferredSize.pNext = nullptr;

	  ze_kernel_properties_t KP = {};
	  KP.stype = ZE_STRUCTURE_TYPE_KERNEL_PROPERTIES;
	  KP.pNext = nullptr;
	  if (l0Device.getDriverAPIVersion() >= ZE_API_VERSION_1_2)
	    KP.pNext = &PreferredSize;

	  CALL_ZE_RET_ERROR(zeKernelGetProperties, zeKernel, &KP);

	  KernelPR.SIMDWidth = KP.maxSubgroupSize;
	  // Use the preferred multiple when it was queried, otherwise fall back to
	  // the maximum sub-group size.
	  if (KP.pNext)
	    KernelPR.Width = PreferredSize.preferredMultiple;
	  else
	    KernelPR.Width = KP.maxSubgroupSize;

	  // On every architecture other than Gen the width is at least twice the
	  // SIMD width.
	  if (!l0Device.isDeviceArch(DeviceArchTy::DeviceArch_Gen))
	    KernelPR.Width = (std::max)(KernelPR.Width, 2 * KernelPR.SIMDWidth);

	  KernelPR.MaxThreadGroupSize = KP.maxSubgroupSize * KP.maxNumSubgroups;
	  return Plugin::success();
	}

	/// Creates the Level Zero kernel object for this kernel's name in the module
	/// of \p Program that contains it, then reads the kernel's properties.
	///
	/// \returns success, or the error from kernel creation or property query.
	Error L0KernelTy::buildKernel(L0ProgramTy &Program) {
	  const auto *KernelName = getName();
	  auto Module = Program.findModuleFromKernelName(KernelName);

	  ze_kernel_desc_t KernelDesc = {};
	  KernelDesc.stype = ZE_STRUCTURE_TYPE_KERNEL_DESC;
	  KernelDesc.pNext = nullptr;
	  KernelDesc.flags = 0;
	  KernelDesc.pKernelName = KernelName;
	  CALL_ZE_RET_ERROR(zeKernelCreate, Module, &KernelDesc, &zeKernel);

	  // The property query yields the final status of the build.
	  return readKernelProperties(Program);
	}

	/// Initializes the kernel from \p Image: builds the Level Zero kernel and, on
	/// success, registers this kernel with the owning program.
	///
	/// \param GenericDevice not used by this implementation.
	/// \param Image device image, converted to the Level Zero program.
	/// \returns success, or the error reported while building the kernel.
	Error L0KernelTy::initImpl(GenericDeviceTy &GenericDevice,
	                           DeviceImageTy &Image) {
	  auto &Program = L0ProgramTy::makeL0Program(Image);

	  Error BuildErr = buildKernel(Program);
	  if (BuildErr)
	    return BuildErr;

	  // Register only kernels that were built successfully.
	  Program.addKernel(this);
	  return Plugin::success();
	}

	/// Chooses the group size and group count for a launch.
	///
	/// A positive \p ThreadLimit forces the group size and a positive
	/// \p NumTeams forces the group count. Values that are not forced are
	/// derived from the device's sub-slice/thread counts and the kernel's SIMD
	/// width and preferred width. When the count is not forced it is multiplied
	/// by the plugin's subscription rate.
	///
	/// \param Device device queried for its limits and topology.
	/// \param NumTeams requested number of teams; 0 means "not specified".
	/// \param ThreadLimit requested thread limit; 0 means "not specified".
	/// \param GroupSizes receives {group size, 1, 1}.
	/// \param KEnv launch environment; its GroupCounts receives {count, 1, 1}
	///        and its HalfNumThreads flag halves the per-sub-slice thread count.
	void L0KernelTy::decideKernelGroupArguments(L0DeviceTy &Device,
	                                            uint32_t NumTeams,
	                                            uint32_t ThreadLimit,
	                                            uint32_t *GroupSizes,
	                                            L0LaunchEnvTy &KEnv) const {

	  const KernelPropertiesTy &KernelPR = getProperties();

	  const auto DeviceId = Device.getDeviceId();
	  bool MaxGroupSizeForced = false;
	  bool MaxGroupCountForced = false;
	  uint32_t MaxGroupSize = Device.getMaxGroupSize();
	  const auto &Option = LevelZeroPluginTy::getOptions();
	  const auto OptSubscRate = Option.SubscriptionRate;
	  auto &GroupCounts = KEnv.GroupCounts;

	  uint32_t SIMDWidth = KernelPR.SIMDWidth;
	  uint32_t KernelWidth = KernelPR.Width;
	  uint32_t KernelMaxThreadGroupSize = KernelPR.MaxThreadGroupSize;

	  // The kernel itself may support fewer work-items per group than the device.
	  if (KernelMaxThreadGroupSize < MaxGroupSize) {
	    MaxGroupSize = KernelMaxThreadGroupSize;
	    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
	         "Capping maximum team size to %" PRIu32
	         " due to kernel constraints.\n",
	         MaxGroupSize);
	  }

	  if (ThreadLimit > 0) {
	    MaxGroupSizeForced = true;
	    MaxGroupSize = ThreadLimit;
	  }

	  uint32_t MaxGroupCount = 0;
	  if (NumTeams > 0) {
	    MaxGroupCount = NumTeams;
	    MaxGroupCountForced = true;
	  }

	  if (MaxGroupCountForced) {
	    // If number of teams is specified by the user, then use KernelWidth
	    // WIs per WG by default, so that it matches
	    // decideLoopKernelGroupArguments() behavior.
	    if (!MaxGroupSizeForced) {
	      MaxGroupSize = KernelWidth;
	    }
	  } else {
	    const uint32_t NumSubslices = Device.getNumSubslices();
	    uint32_t NumThreadsPerSubslice = Device.getNumThreadsPerSubslice();
	    if (KEnv.HalfNumThreads)
	      NumThreadsPerSubslice /= 2;

	    // Default: one group per hardware thread.
	    MaxGroupCount = NumSubslices * NumThreadsPerSubslice;
	    if (MaxGroupSizeForced) {
	      // Set group count for the HW capacity given the forced group size.
	      uint32_t NumThreadsPerGroup = (MaxGroupSize + SIMDWidth - 1) / SIMDWidth;
	      uint32_t NumGroupsPerSubslice =
	          (NumThreadsPerSubslice + NumThreadsPerGroup - 1) / NumThreadsPerGroup;
	      MaxGroupCount = NumGroupsPerSubslice * NumSubslices;
	    } else {
	      assert(!MaxGroupSizeForced && !MaxGroupCountForced);
	      assert((MaxGroupSize <= KernelWidth || MaxGroupSize % KernelWidth == 0) &&
	             "Invalid maxGroupSize");
	      // Maximize group size: step down by KernelWidth until the groups
	      // evenly divide the threads of a sub-slice.
	      while (MaxGroupSize >= KernelWidth) {
	        uint32_t NumThreadsPerGroup =
	            (MaxGroupSize + SIMDWidth - 1) / SIMDWidth;

	        if (NumThreadsPerSubslice % NumThreadsPerGroup == 0) {
	          uint32_t NumGroupsPerSubslice =
	              NumThreadsPerSubslice / NumThreadsPerGroup;
	          MaxGroupCount = NumGroupsPerSubslice * NumSubslices;
	          break;
	        }
	        MaxGroupSize -= KernelWidth;
	      }
	    }
	  }

	  uint32_t GRPCounts[3] = {MaxGroupCount, 1, 1};
	  uint32_t GRPSizes[3] = {MaxGroupSize, 1, 1};
	  // Oversubscribe only when the user did not pin the number of teams.
	  if (!MaxGroupCountForced) {
	    GRPCounts[0] *= OptSubscRate;
	  }
	  GroupCounts.groupCountX = GRPCounts[0];
	  GroupCounts.groupCountY = GRPCounts[1];
	  GroupCounts.groupCountZ = GRPCounts[2];
	  std::copy(GRPSizes, GRPSizes + 3, GroupSizes);
	}

	/// Computes the group sizes and group counts for a launch.
	///
	/// Clamps \p ThreadLimit to the smaller of the device limit and the kernel's
	/// maximum thread-group size, then delegates to
	/// decideKernelGroupArguments().
	///
	/// \param Device device the kernel will run on.
	/// \param NumTeams requested number of teams; values <= 0 mean "unspecified".
	/// \param ThreadLimit requested thread limit; values <= 0 mean "unspecified".
	/// \param GroupSizes receives the three group sizes.
	/// \param KEnv launch environment; HalfNumThreads and GroupCounts are set.
	/// \returns always success in the visible code.
	Error L0KernelTy::getGroupsShape(L0DeviceTy &Device, int32_t NumTeams,
	                                 int32_t ThreadLimit, uint32_t *GroupSizes,
	                                 L0LaunchEnvTy &KEnv) const {

	  const auto DeviceId = Device.getDeviceId();
	  const auto &KernelPR = getProperties();

	  // Stand-ins for the global thread-limit and max-teams ICVs. Both are fixed
	  // at 0 here, so the ICV branches below are currently never taken.
	  const int32_t NumTeamsICV = 0;
	  const int32_t ThreadLimitICV = 0;

	  const bool IsXeHPG = Device.isDeviceArch(DeviceArchTy::DeviceArch_XeHPG);
	  KEnv.HalfNumThreads =
	      LevelZeroPluginTy::getOptions().ZeDebugEnabled && IsXeHPG;
	  const uint32_t KernelWidth = KernelPR.Width;
	  const uint32_t SIMDWidth = KernelPR.SIMDWidth;
	  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
	       "Assumed kernel SIMD width is %" PRIu32 "\n", SIMDWidth);
	  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
	       "Preferred team size is multiple of %" PRIu32 "\n", KernelWidth);
	  assert(SIMDWidth <= KernelWidth && "Invalid SIMD width.");

	  if (ThreadLimit > 0) {
	    // Use the thread_limit clause value.
	    DP("Max team size is set to %" PRId32 " (thread_limit clause)\n",
	       ThreadLimit);
	  } else if (ThreadLimitICV > 0) {
	    // Otherwise use the thread-limit-var ICV.
	    ThreadLimit = ThreadLimitICV;
	    DP("Max team size is set to %" PRId32 " (thread-limit-icv)\n", ThreadLimit);
	  }

	  size_t MaxThreadLimit = Device.getMaxGroupSize();
	  // Set correct max group size if the kernel was compiled with explicit SIMD.
	  if (SIMDWidth == 1)
	    MaxThreadLimit = Device.getNumThreadsPerSubslice();

	  if (KernelPR.MaxThreadGroupSize < MaxThreadLimit) {
	    MaxThreadLimit = KernelPR.MaxThreadGroupSize;
	    DP("Capping maximum team size to %zu due to kernel constraints.\n",
	       MaxThreadLimit);
	  }

	  // Compare in size_t so a large limit is never narrowed before the
	  // comparison. Inside the branch MaxThreadLimit < ThreadLimit, so the
	  // narrowing assignment cannot overflow.
	  if (ThreadLimit > 0 && static_cast<size_t>(ThreadLimit) > MaxThreadLimit) {
	    ThreadLimit = static_cast<int32_t>(MaxThreadLimit);
	    DP("Max team size exceeds current maximum %zu. Adjusted\n",
	       MaxThreadLimit);
	  }
	  // scope code to ease integration with downstream custom code.
	  {
	    if (NumTeams > 0) {
	      DP("Number of teams is set to %" PRId32
	         "(num_teams clause or no teams construct)\n",
	         NumTeams);
	    } else if (NumTeamsICV > 0) {
	      // OMP_NUM_TEAMS only matters, if num_teams() clause is absent.
	      // NOTE(review): the message says the value is ignored, yet it is
	      // applied right below - confirm which one is intended.
	      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
	           "OMP_NUM_TEAMS(%" PRId32 ") is ignored\n", NumTeamsICV);

	      NumTeams = NumTeamsICV;
	      DP("Max number of teams is set to %" PRId32 " (OMP_NUM_TEAMS)\n",
	         NumTeams);
	    }

	    decideKernelGroupArguments(Device, static_cast<uint32_t>(NumTeams),
	                               static_cast<uint32_t>(ThreadLimit), GroupSizes,
	                               KEnv);
	  }

	  return Plugin::success();
	}

	/// Submits \p zeKernel through the device's immediate command list.
	///
	/// The caller (launchImpl) enters with KEnv.KernelPR.Mtx locked; this
	/// function releases it right after the launch is appended, or on any
	/// earlier error return. In async mode the signal event is recorded on the
	/// async queue; otherwise the host waits for the event and releases it.
	///
	/// \param CommandMode in AsyncOrdered mode the kernel waits only on the most
	///        recent queued event, otherwise on all queued events.
	/// \returns success, or the first error encountered.
	static Error launchKernelWithImmCmdList(L0DeviceTy &l0Device,
	                                        ze_kernel_handle_t zeKernel,
	                                        L0LaunchEnvTy &KEnv,
	                                        CommandModeTy CommandMode) {
	  // Adopt the already-held kernel mutex so that every early return below
	  // unlocks it instead of leaving it locked for the next launch.
	  std::unique_lock KernelLock(KEnv.KernelPR.Mtx, std::adopt_lock);

	  const auto DeviceId = l0Device.getDeviceId();
	  auto *IdStr = l0Device.getZeIdCStr();
	  auto CmdListOrErr = l0Device.getImmCmdList();
	  if (!CmdListOrErr)
	    return CmdListOrErr.takeError();
	  const ze_command_list_handle_t CmdList = *CmdListOrErr;
	  // Command queue is not used with immediate command list.

	  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
	       "Using immediate command list for kernel submission.\n");
	  auto EventOrError = l0Device.getEvent();
	  if (!EventOrError)
	    return EventOrError.takeError();
	  ze_event_handle_t Event = *EventOrError;
	  size_t NumWaitEvents = 0;
	  ze_event_handle_t *WaitEvents = nullptr;
	  auto *AsyncQueue = KEnv.AsyncQueue;
	  if (KEnv.IsAsync && !AsyncQueue->WaitEvents.empty()) {
	    if (CommandMode == CommandModeTy::AsyncOrdered) {
	      // Ordered mode: depending on the last event is sufficient.
	      NumWaitEvents = 1;
	      WaitEvents = &AsyncQueue->WaitEvents.back();
	    } else {
	      NumWaitEvents = AsyncQueue->WaitEvents.size();
	      WaitEvents = AsyncQueue->WaitEvents.data();
	    }
	  }
	  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
	       "Kernel depends on %zu data copying events.\n", NumWaitEvents);
	  CALL_ZE_RET_ERROR(zeCommandListAppendLaunchKernel, CmdList, zeKernel,
	                    &KEnv.GroupCounts, Event, NumWaitEvents, WaitEvents);
	  // The kernel state has been consumed by the append; other launches of the
	  // same kernel may proceed.
	  KernelLock.unlock();
	  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
	       "Submitted kernel " DPxMOD " to device %s\n", DPxPTR(zeKernel), IdStr);

	  if (KEnv.IsAsync) {
	    AsyncQueue->WaitEvents.push_back(Event);
	    AsyncQueue->KernelEvent = Event;
	  } else {
	    CALL_ZE_RET_ERROR(zeEventHostSynchronize, Event, L0DefaultTimeout);
	    if (auto Err = l0Device.releaseEvent(Event))
	      return Err;
	  }
	  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
	       "Executed kernel entry " DPxMOD " on device %s\n", DPxPTR(zeKernel),
	       IdStr);

	  return Plugin::success();
	}

	/// Submits \p zeKernel through a regular command list and command queue and
	/// waits for the queue to finish.
	///
	/// The caller (launchImpl) enters with KEnv.KernelPR.Mtx locked; this
	/// function releases it right after the launch is appended, or on any
	/// earlier error return.
	///
	/// \returns success, or the first error encountered.
	static Error launchKernelWithCmdQueue(L0DeviceTy &l0Device,
	                                      ze_kernel_handle_t zeKernel,
	                                      L0LaunchEnvTy &KEnv) {
	  // Adopt the already-held kernel mutex so that every early return below
	  // unlocks it instead of leaving it locked for the next launch.
	  std::unique_lock KernelLock(KEnv.KernelPR.Mtx, std::adopt_lock);

	  const auto DeviceId = l0Device.getDeviceId();
	  const auto *IdStr = l0Device.getZeIdCStr();

	  auto CmdListOrErr = l0Device.getCmdList();
	  if (!CmdListOrErr)
	    return CmdListOrErr.takeError();
	  ze_command_list_handle_t CmdList = *CmdListOrErr;
	  auto CmdQueueOrErr = l0Device.getCmdQueue();
	  if (!CmdQueueOrErr)
	    return CmdQueueOrErr.takeError();
	  const ze_command_queue_handle_t CmdQueue = *CmdQueueOrErr;

	  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
	       "Using regular command list for kernel submission.\n");

	  // No signal event is used on this path; completion is observed through
	  // zeCommandQueueSynchronize below.
	  ze_event_handle_t Event = nullptr;
	  CALL_ZE_RET_ERROR(zeCommandListAppendLaunchKernel, CmdList, zeKernel,
	                    &KEnv.GroupCounts, Event, 0, nullptr);
	  // The kernel state has been consumed by the append; other launches of the
	  // same kernel may proceed.
	  KernelLock.unlock();
	  CALL_ZE_RET_ERROR(zeCommandListClose, CmdList);
	  CALL_ZE_RET_ERROR_MTX(zeCommandQueueExecuteCommandLists, l0Device.getMutex(),
	                        CmdQueue, 1, &CmdList, nullptr);
	  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
	       "Submitted kernel " DPxMOD " to device %s\n", DPxPTR(zeKernel), IdStr);
	  CALL_ZE_RET_ERROR(zeCommandQueueSynchronize, CmdQueue, L0DefaultTimeout);
	  CALL_ZE_RET_ERROR(zeCommandListReset, CmdList);
	  if (Event) {
	    if (auto Err = l0Device.releaseEvent(Event))
	      return Err;
	  }
	  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
	       "Executed kernel entry " DPxMOD " on device %s\n", DPxPTR(zeKernel),
	       IdStr);

	  return Plugin::success();
	}

	/// Sets the group size on the Level Zero kernel and fills KEnv.GroupCounts.
	///
	/// When the kernel's execution mode is not OMP_TGT_EXEC_MODE_BARE the values
	/// in \p NumThreads / \p NumBlocks are applied as given. Otherwise only the
	/// first dimension is taken as a thread-limit / num-teams hint and the shape
	/// is computed by getGroupsShape(), or reused from the cache when the same
	/// hints were used for the previous successful configuration.
	///
	/// \returns success, or the error from the shape computation or from
	///          zeKernelSetGroupSize.
	Error L0KernelTy::setKernelGroups(L0DeviceTy &l0Device, L0LaunchEnvTy &KEnv,
	                                  uint32_t NumThreads[3],
	                                  uint32_t NumBlocks[3]) const {

	  if (KernelEnvironment.Configuration.ExecMode != OMP_TGT_EXEC_MODE_BARE) {
	    // For non-bare mode, the groups are already set in the launch.
	    KEnv.GroupCounts = {NumBlocks[0], NumBlocks[1], NumBlocks[2]};
	    CALL_ZE_RET_ERROR(zeKernelSetGroupSize, getZeKernel(), NumThreads[0],
	                      NumThreads[1], NumThreads[2]);
	    return Plugin::success();
	  }

	  // Values that do not fit a non-negative int32_t are treated as
	  // "unspecified" (0).
	  int32_t NumTeams = NumBlocks[0];
	  int32_t ThreadLimit = NumThreads[0];
	  if (NumTeams < 0)
	    NumTeams = 0;
	  if (ThreadLimit < 0)
	    ThreadLimit = 0;

	  uint32_t GroupSizes[3];
	  auto DeviceId = l0Device.getDeviceId();
	  auto &KernelPR = KEnv.KernelPR;
	  // Check if we can reuse previous group parameters.
	  const bool GroupParamsReused =
	      KernelPR.reuseGroupParams(NumTeams, ThreadLimit, GroupSizes, KEnv);

	  if (!GroupParamsReused) {
	    if (auto Err =
	            getGroupsShape(l0Device, NumTeams, ThreadLimit, GroupSizes, KEnv))
	      return Err;
	  }

	  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
	       "Team sizes = {%" PRIu32 ", %" PRIu32 ", %" PRIu32 "}\n", GroupSizes[0],
	       GroupSizes[1], GroupSizes[2]);
	  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
	       "Number of teams = {%" PRIu32 ", %" PRIu32 ", %" PRIu32 "}\n",
	       KEnv.GroupCounts.groupCountX, KEnv.GroupCounts.groupCountY,
	       KEnv.GroupCounts.groupCountZ);

	  if (!GroupParamsReused) {
	    CALL_ZE_RET_ERROR(zeKernelSetGroupSize, getZeKernel(), GroupSizes[0],
	                      GroupSizes[1], GroupSizes[2]);
	    // Cache only after the kernel accepted the group size. Caching earlier
	    // would let a later launch with the same hints skip
	    // zeKernelSetGroupSize even though this call failed.
	    KernelPR.cacheGroupParams(NumTeams, ThreadLimit, GroupSizes, KEnv);
	  }

	  return Plugin::success();
	}

	/// Updates the kernel's indirect-access flags when the flags reported by the
	/// host and device memory allocators differ from the cached value.
	///
	/// \returns success, or the error from zeKernelSetIndirectAccess.
	Error L0KernelTy::setIndirectFlags(L0DeviceTy &l0Device,
	                                   L0LaunchEnvTy &KEnv) const {
	  // Set Kernel Indirect flags.
	  ze_kernel_indirect_access_flags_t Flags = 0;
	  Flags |= l0Device.getMemAllocator(TARGET_ALLOC_HOST).getIndirectFlags();
	  Flags |= l0Device.getMemAllocator(TARGET_ALLOC_DEVICE).getIndirectFlags();

	  if (KEnv.KernelPR.IndirectAccessFlags != Flags) {
	    // Combine with common access flags.
	    const auto FinalFlags = l0Device.getIndirectFlags() | Flags;
	    CALL_ZE_RET_ERROR(zeKernelSetIndirectAccess, zeKernel, FinalFlags);
	    DP("Setting indirect access flags " DPxMOD "\n", DPxPTR(FinalFlags));
	    // The cache holds the allocator flags only, without the common ones.
	    KEnv.KernelPR.IndirectAccessFlags = Flags;
	  }

	  return Plugin::success();
	}

	/// Launches this kernel on \p GenericDevice.
	///
	/// Resolves the async queue (falling back to synchronous execution when none
	/// can be obtained). Then, while holding the kernel-properties mutex, it
	/// configures the group shape, the kernel arguments and the indirect-access
	/// flags, and finally submits through an immediate command list or a regular
	/// command list/queue.
	///
	/// \param NumThreads, NumBlocks launch shape or hints, see setKernelGroups().
	/// \param KernelArgs only NumArgs is read here.
	/// \param LaunchParams LaunchParams.Data is read as an array of NumArgs
	///        pointer-sized argument values.
	/// \returns success, or the first error reported while preparing or
	///          submitting the kernel.
	Error L0KernelTy::launchImpl(GenericDeviceTy &GenericDevice,
	                             uint32_t NumThreads[3], uint32_t NumBlocks[3],
	                             KernelArgsTy &KernelArgs,
	                             KernelLaunchParamsTy LaunchParams,
	                             AsyncInfoWrapperTy &AsyncInfoWrapper) const {
	  auto &l0Device = L0DeviceTy::makeL0Device(GenericDevice);
	  __tgt_async_info *AsyncInfo = AsyncInfoWrapper;

	  auto zeKernel = getZeKernel();
	  auto DeviceId = l0Device.getDeviceId();
	  int32_t NumArgs = KernelArgs.NumArgs;
	  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, "Launching kernel " DPxMOD "...\n",
	       DPxPTR(zeKernel));

	  auto &Plugin = l0Device.getPlugin();
	  auto *IdStr = l0Device.getZeIdCStr();
	  auto &Options = LevelZeroPluginTy::getOptions();
	  bool IsAsync = AsyncInfo && l0Device.asyncEnabled();
	  if (IsAsync && !AsyncInfo->Queue) {
	    AsyncInfo->Queue = reinterpret_cast<void *>(Plugin.getAsyncQueue());
	    if (!AsyncInfo->Queue)
	      IsAsync = false; // Couldn't get a queue, revert to sync.
	  }
	  auto *AsyncQueue =
	      IsAsync ? static_cast<AsyncQueueTy *>(AsyncInfo->Queue) : nullptr;
	  auto &KernelPR = getProperties();

	  L0LaunchEnvTy KEnv(IsAsync, AsyncQueue, KernelPR);

	  // Protect from kernel preparation to submission as kernels are shared.
	  // The guard unlocks the mutex on every early error return below; a bare
	  // lock() would leave it held and block the next launch of this kernel.
	  std::unique_lock KernelLock(KernelPR.Mtx);

	  if (auto Err = setKernelGroups(l0Device, KEnv, NumThreads, NumBlocks))
	    return Err;

	  // Set kernel arguments.
	  for (int32_t I = 0; I < NumArgs; I++) {
	    // Scope code to ease integration with downstream custom code.
	    {
	      void *Arg = (static_cast<void **>(LaunchParams.Data))[I];
	      // A null argument value is passed as a null argument pointer.
	      CALL_ZE_RET_ERROR(zeKernelSetArgumentValue, zeKernel, I, sizeof(Arg),
	                        Arg == nullptr ? nullptr : &Arg);
	      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
	           "Kernel Pointer argument %" PRId32 " (value: " DPxMOD
	           ") was set successfully for device %s.\n",
	           I, DPxPTR(Arg), IdStr);
	    }
	  }

	  if (auto Err = setIndirectFlags(l0Device, KEnv))
	    return Err;

	  // The next calls unlock the kernel mutex internally, so give up ownership
	  // here without unlocking.
	  (void)KernelLock.release();
	  const bool UseImmCmdList = l0Device.useImmForCompute();
	  if (UseImmCmdList)
	    return launchKernelWithImmCmdList(l0Device, zeKernel, KEnv,
	                                      Options.CommandMode);

	  return launchKernelWithCmdQueue(l0Device, zeKernel, KEnv);
	}

	} // namespace llvm::omp::target::plugin