offload/plugins-nextgen/level_zero/include/L0Options.h - llvm-project.git - Git at Google

 //===--- Level Zero Target RTL Implementation -----------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 // Level Zero RTL Options support.
 //
 //===----------------------------------------------------------------------===//

 #ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0OPTIONS_H
 #define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0OPTIONS_H

 #include <level_zero/ze_api.h>
 #include <new>

 #include "Shared/EnvironmentVar.h"

 #include "L0Defs.h"

 namespace llvm::omp::target::plugin {
 /// How commands are submitted: synchronously, asynchronously, or
 /// asynchronously with ordering.
 enum class CommandModeTy {
   Sync = 0,
   Async = 1,
   AsyncOrdered = 2,
 };

 /// Specialization constants used for a module compilation.
 ///
 /// Keeps two parallel lists: the constant IDs and pointers to the constant
 /// values. Storage for the values is taken from the BumpPtrAllocator supplied
 /// at construction; the allocator is referenced, not owned, by this class.
 class SpecConstantsTy {
   /// IDs of the registered constants; entry I pairs with ConstantValues[I].
   std::vector<uint32_t> ConstantIds;
   /// Pointers to the value copies placed in storage obtained from Allocator.
   std::vector<const void *> ConstantValues;
   /// Source of the storage for constant values (not owned).
   BumpPtrAllocator &Allocator;

 public:
   SpecConstantsTy(BumpPtrAllocator &Allocator) : Allocator(Allocator) {}
   SpecConstantsTy(const SpecConstantsTy &) = delete;
   SpecConstantsTy(SpecConstantsTy &&) = delete;
   SpecConstantsTy &operator=(const SpecConstantsTy &) = delete;
   SpecConstantsTy &operator=(const SpecConstantsTy &&) = delete;
   /// Construct from a const rvalue.
   ///
   /// Nothing can be moved out of a const object, so both lists are copied
   /// (wrapping the members in std::move would still select the copy
   /// constructors). The new object refers to the same allocator as \p Other
   /// and its value pointers alias the storage registered through \p Other.
   SpecConstantsTy(const SpecConstantsTy &&Other)
       : ConstantIds(Other.ConstantIds), ConstantValues(Other.ConstantValues),
         Allocator(Other.Allocator) {}
   ~SpecConstantsTy() = default;

   /// Register constant \p Id with value \p Val.
   ///
   /// A copy of \p Val is constructed in storage obtained from the allocator
   /// and its address is recorded next to \p Id. This class never invokes a
   /// destructor on the stored copy, so \p T is expected to be a trivially
   /// destructible value type.
   template <typename T> void addConstant(uint32_t Id, T Val) {
     void *Storage = Allocator.Allocate(sizeof(T), alignof(T));
     // The storage is raw memory: construct the object in place instead of
     // assigning through a pointer to a T that does not exist yet.
     T *ValuePtr = ::new (Storage) T(Val);

     ConstantIds.push_back(Id);
     ConstantValues.push_back(ValuePtr);
   }

   /// Build the Level Zero descriptor for the registered constants.
   ///
   /// The returned structure points into the internal lists, so it is only
   /// usable while this object is alive and no further constant is added.
   ze_module_constants_t getModuleConstants() const {
     ze_module_constants_t Constants{
         static_cast<uint32_t>(ConstantValues.size()), ConstantIds.data(),
         // Unfortunately we have to const_cast it.
         // L0 data type should probably be fixed.
         const_cast<const void **>(ConstantValues.data())};
     return Constants;
   }
 };

 /// Bit flags of the L0 plugin, packed into one 64-bit word.
 struct L0OptionFlagsTy {
   /// Memory pool switch; set by default.
   uint64_t UseMemoryPool : 1;
   /// Remaining bits, cleared by default.
   uint64_t Reserved : 63;

   L0OptionFlagsTy() : UseMemoryPool{1}, Reserved{0} {}
 };

 /// Options of the Level Zero plugin.
 ///
 /// The members below carry the built-in defaults. init() calls
 /// processEnvironmentVars() (declared here, defined elsewhere) once.
 struct L0OptionsTy {
   /// Binary flags.
   L0OptionFlagsTy Flags;

   /// Staging buffer size.
   size_t StagingBufferSize = L0StagingBufferSize;

   /// Staging buffer count.
   size_t StagingBufferCount = L0StagingBufferCount;

   /// Default parameters of one memory pool.
   struct MemPoolConfigTy {
     bool Use;         ///< Use the pool by default.
     int32_t AllocMax; ///< AllocMax, in MB.
     int32_t Capacity; ///< Capacity.
     int32_t PoolSize; ///< PoolSize, in MB.
   };
   /// Memory pool default parameters for each allocation kind:
   /// {UseByDefault, AllocMax(MB), Capacity, PoolSize(MB)}
   std::array<MemPoolConfigTy, 3> MemPoolConfig{
       MemPoolConfigTy{true, 1, 4, 256},  // TARGET_ALLOC_DEVICE
       MemPoolConfigTy{true, 1, 4, 256},  // TARGET_ALLOC_HOST
       MemPoolConfigTy{true, 8, 4, 256}}; // TARGET_ALLOC_SHARED

   /// Parameters for memory pools dedicated to reduction scratch space.
   /// NOTE(review): the meaning of each element is not visible in this
   /// header; confirm against the code that consumes it.
   std::array<int32_t, 3> ReductionPoolInfo{256, 8, 8192};

   /// Oversubscription rate for normal kernels.
   uint32_t SubscriptionRate = 4;

   /// Loop kernels with known ND-range may be known to have
   /// few iterations and they may not exploit the offload device
   /// to the fullest extent.
   /// Let's assume a device has N total HW threads available,
   /// and the kernel requires M hardware threads with LWS set to L.
   /// If (M < N * ThinThreadsThreshold), then we will try
   /// to iteratively divide L by 2 to increase the number of HW
   /// threads used for executing the kernel. Effectively, we will
   /// end up with L less than the kernel's SIMD width, so the HW
   /// threads will not use all their SIMD lanes. This (presumably) should
   /// allow more parallelism, because the stalls in the SIMD lanes
   /// will be distributed across more HW threads, and the probability
   /// of having a stall (or a sequence of stalls) on a critical path
   /// in the kernel should decrease.
   /// Anyway, this is just a heuristic that seems to work well for some
   /// kernels (which poorly expose parallelism in the first place).
   double ThinThreadsThreshold = 0.1;

   // Compilation options for IGC.
   // OpenCL 2.0 builtins (such as atomic_load_explicit) are used by the
   // runtime, so we have to explicitly specify the "-cl-std=CL2.0" compilation
   // option. With it, the SPIR-V will be converted to LLVM IR with OpenCL 2.0
   // builtins. Otherwise, SPIR-V will be converted to LLVM IR with OpenCL 1.2
   // builtins.
   static constexpr std::string_view CompilationOptions = "-cl-std=CL2.0 ";
   static constexpr std::string_view InternalCompilationOptions =
       "-cl-take-global-address";
   std::string UserCompilationOptions;

   // Allocator for long-lived allocations (e.g. spec constants).
   // Declared before CommonSpecConstants, which keeps a reference to it, so
   // that the allocator is constructed first and destroyed last.
   BumpPtrAllocator Allocator;

   /// Spec constants used for all modules.
   SpecConstantsTy CommonSpecConstants;

   /// Command execution mode.
   /// Whether the runtime uses asynchronous mode or not depends on the type of
   /// devices and whether immediate command list is fully enabled.
   CommandModeTy CommandMode = CommandModeTy::Async;

   /// Controls if we need to reduce available HW threads. We need this
   /// adjustment on XeHPG when Level Zero debug is enabled
   /// (ZET_ENABLE_PROGRAM_DEBUGGING=1).
   bool ZeDebugEnabled = false;

   bool Init = false; // Have the options already been processed.

   L0OptionsTy() : CommonSpecConstants(Allocator) {}

   /// Read environment variables.
   void processEnvironmentVars();

   /// Process the environment variables on the first call; later calls do
   /// nothing.
   void init() {
     if (Init)
       return;
     processEnvironmentVars();
     Init = true;
   }

   /// Return true if the value of \p Var equals \p Matched, ignoring case.
   bool match(const StringEnvar &Var, const llvm::StringRef Matched) const {
     return Matched.equals_insensitive(Var.get());
   }

 }; // L0OptionsTy

 } // namespace llvm::omp::target::plugin

 #endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0OPTIONS_H
	//===--- Level Zero Target RTL Implementation -----------------------------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// Level Zero RTL Options support.
	//
	//===----------------------------------------------------------------------===//

	#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0OPTIONS_H
	#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0OPTIONS_H

	#include <level_zero/ze_api.h>

	#include "Shared/EnvironmentVar.h"

	#include "L0Defs.h"

	namespace llvm::omp::target::plugin {
	/// How commands are submitted: synchronously, asynchronously, or
	/// asynchronously with ordering.
	enum class CommandModeTy {
	  Sync = 0,
	  Async = 1,
	  AsyncOrdered = 2,
	};

	/// Specialization constants used for a module compilation.
	///
	/// Keeps two parallel lists: the constant IDs and pointers to the constant
	/// values. Storage for the values is taken from the BumpPtrAllocator supplied
	/// at construction; the allocator is referenced, not owned, by this class.
	class SpecConstantsTy {
	  /// IDs of the registered constants; entry I pairs with ConstantValues[I].
	  std::vector<uint32_t> ConstantIds;
	  /// Pointers to the value copies placed in storage obtained from Allocator.
	  std::vector<const void *> ConstantValues;
	  /// Source of the storage for constant values (not owned).
	  BumpPtrAllocator &Allocator;

	public:
	  SpecConstantsTy(BumpPtrAllocator &Allocator) : Allocator(Allocator) {}
	  SpecConstantsTy(const SpecConstantsTy &) = delete;
	  SpecConstantsTy(SpecConstantsTy &&) = delete;
	  SpecConstantsTy &operator=(const SpecConstantsTy &) = delete;
	  SpecConstantsTy &operator=(const SpecConstantsTy &&) = delete;
	  /// Construct from a const rvalue.
	  ///
	  /// Nothing can be moved out of a const object, so both lists are copied
	  /// (wrapping the members in std::move would still select the copy
	  /// constructors). The new object refers to the same allocator as \p Other
	  /// and its value pointers alias the storage registered through \p Other.
	  SpecConstantsTy(const SpecConstantsTy &&Other)
	      : ConstantIds(Other.ConstantIds), ConstantValues(Other.ConstantValues),
	        Allocator(Other.Allocator) {}
	  ~SpecConstantsTy() = default;

	  /// Register constant \p Id with value \p Val.
	  ///
	  /// A copy of \p Val is constructed in storage obtained from the allocator
	  /// and its address is recorded next to \p Id. This class never invokes a
	  /// destructor on the stored copy, so \p T is expected to be a trivially
	  /// destructible value type.
	  template <typename T> void addConstant(uint32_t Id, T Val) {
	    void *Storage = Allocator.Allocate(sizeof(T), alignof(T));
	    // The storage is raw memory: construct the object in place instead of
	    // assigning through a pointer to a T that does not exist yet.
	    T *ValuePtr = ::new (Storage) T(Val);

	    ConstantIds.push_back(Id);
	    ConstantValues.push_back(ValuePtr);
	  }

	  /// Build the Level Zero descriptor for the registered constants.
	  ///
	  /// The returned structure points into the internal lists, so it is only
	  /// usable while this object is alive and no further constant is added.
	  ze_module_constants_t getModuleConstants() const {
	    ze_module_constants_t Constants{
	        static_cast<uint32_t>(ConstantValues.size()), ConstantIds.data(),
	        // Unfortunately we have to const_cast it.
	        // L0 data type should probably be fixed.
	        const_cast<const void **>(ConstantValues.data())};
	    return Constants;
	  }
	};

	/// Bit flags of the L0 plugin, packed into one 64-bit word.
	struct L0OptionFlagsTy {
	  /// Memory pool switch; set by default.
	  uint64_t UseMemoryPool : 1;
	  /// Remaining bits, cleared by default.
	  uint64_t Reserved : 63;

	  L0OptionFlagsTy() : UseMemoryPool{1}, Reserved{0} {}
	};

	/// Options of the Level Zero plugin.
	///
	/// The members below carry the built-in defaults. init() calls
	/// processEnvironmentVars() (declared here, defined elsewhere) once.
	struct L0OptionsTy {
	  /// Binary flags.
	  L0OptionFlagsTy Flags;

	  /// Staging buffer size.
	  size_t StagingBufferSize = L0StagingBufferSize;

	  /// Staging buffer count.
	  size_t StagingBufferCount = L0StagingBufferCount;

	  /// Default parameters of one memory pool.
	  struct MemPoolConfigTy {
	    bool Use;         ///< Use the pool by default.
	    int32_t AllocMax; ///< AllocMax, in MB.
	    int32_t Capacity; ///< Capacity.
	    int32_t PoolSize; ///< PoolSize, in MB.
	  };
	  /// Memory pool default parameters for each allocation kind:
	  /// {UseByDefault, AllocMax(MB), Capacity, PoolSize(MB)}
	  std::array<MemPoolConfigTy, 3> MemPoolConfig{
	      MemPoolConfigTy{true, 1, 4, 256},  // TARGET_ALLOC_DEVICE
	      MemPoolConfigTy{true, 1, 4, 256},  // TARGET_ALLOC_HOST
	      MemPoolConfigTy{true, 8, 4, 256}}; // TARGET_ALLOC_SHARED

	  /// Parameters for memory pools dedicated to reduction scratch space.
	  /// NOTE(review): the meaning of each element is not visible in this
	  /// header; confirm against the code that consumes it.
	  std::array<int32_t, 3> ReductionPoolInfo{256, 8, 8192};

	  /// Oversubscription rate for normal kernels.
	  uint32_t SubscriptionRate = 4;

	  /// Loop kernels with known ND-range may be known to have
	  /// few iterations and they may not exploit the offload device
	  /// to the fullest extent.
	  /// Let's assume a device has N total HW threads available,
	  /// and the kernel requires M hardware threads with LWS set to L.
	  /// If (M < N * ThinThreadsThreshold), then we will try
	  /// to iteratively divide L by 2 to increase the number of HW
	  /// threads used for executing the kernel. Effectively, we will
	  /// end up with L less than the kernel's SIMD width, so the HW
	  /// threads will not use all their SIMD lanes. This (presumably) should
	  /// allow more parallelism, because the stalls in the SIMD lanes
	  /// will be distributed across more HW threads, and the probability
	  /// of having a stall (or a sequence of stalls) on a critical path
	  /// in the kernel should decrease.
	  /// Anyway, this is just a heuristic that seems to work well for some
	  /// kernels (which poorly expose parallelism in the first place).
	  double ThinThreadsThreshold = 0.1;

	  // Compilation options for IGC.
	  // OpenCL 2.0 builtins (such as atomic_load_explicit) are used by the
	  // runtime, so we have to explicitly specify the "-cl-std=CL2.0" compilation
	  // option. With it, the SPIR-V will be converted to LLVM IR with OpenCL 2.0
	  // builtins. Otherwise, SPIR-V will be converted to LLVM IR with OpenCL 1.2
	  // builtins.
	  static constexpr std::string_view CompilationOptions = "-cl-std=CL2.0 ";
	  static constexpr std::string_view InternalCompilationOptions =
	      "-cl-take-global-address";
	  std::string UserCompilationOptions;

	  // Allocator for long-lived allocations (e.g. spec constants).
	  // Declared before CommonSpecConstants, which keeps a reference to it, so
	  // that the allocator is constructed first and destroyed last.
	  BumpPtrAllocator Allocator;

	  /// Spec constants used for all modules.
	  SpecConstantsTy CommonSpecConstants;

	  /// Command execution mode.
	  /// Whether the runtime uses asynchronous mode or not depends on the type of
	  /// devices and whether immediate command list is fully enabled.
	  CommandModeTy CommandMode = CommandModeTy::Async;

	  /// Controls if we need to reduce available HW threads. We need this
	  /// adjustment on XeHPG when Level Zero debug is enabled
	  /// (ZET_ENABLE_PROGRAM_DEBUGGING=1).
	  bool ZeDebugEnabled = false;

	  bool Init = false; // Have the options already been processed.

	  L0OptionsTy() : CommonSpecConstants(Allocator) {}

	  /// Read environment variables.
	  void processEnvironmentVars();

	  /// Process the environment variables on the first call; later calls do
	  /// nothing.
	  void init() {
	    if (Init)
	      return;
	    processEnvironmentVars();
	    Init = true;
	  }

	  /// Return true if the value of \p Var equals \p Matched, ignoring case.
	  bool match(const StringEnvar &Var, const llvm::StringRef Matched) const {
	    return Matched.equals_insensitive(Var.get());
	  }

	}; // L0OptionsTy

	} // namespace llvm::omp::target::plugin

	#endif // OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_LEVEL_ZERO_L0OPTIONS_H