//===- GPUHeuristics.h - GPU heuristics for Linalg transforms ---*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef MLIR_DIALECT_LINALG_TRANSFORMOPS_GPUHEURISTICS_H
#define MLIR_DIALECT_LINALG_TRANSFORMOPS_GPUHEURISTICS_H

#include "mlir/IR/Attributes.h"
#include "mlir/IR/MLIRContext.h"
#include "mlir/Support/LLVM.h"
#include "mlir/Support/LogicalResult.h"
#include "llvm/Support/Compiler.h"

namespace mlir {
namespace transform {
namespace gpu {

/// Base struct to hold GPU mapping information for a given operation.
struct MappingInfo {
/// Number of threads to use for the mapping.
/// Note: When the number of threads used is smaller than the total number of
/// available threads, predication ensues. It is often useful to use more
/// threads and saturate memory bandwidth for some operations, even if others
/// end up being predicated.
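/// E.g. (hypothetical numbers): a copy mapped with numThreads = {32, 4} in a
/// block launched with 256 threads only occupies 128 of them; the remaining
/// 128 threads are predicated off for this operation.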
SmallVector<int64_t> numThreads;
/// Thread mapping attributes, one per entry of `numThreads`.
SmallVector<Attribute> threadMapping;
};

struct CopyMappingInfo : public MappingInfo {
/// Status of the mapping computation. `Invalid` usually means that too many
/// threads are required and we fail to map; this typically happens when the
/// copy is too large compared to the number of available threads.
enum class Status { Success = 0, RequiresPredication, Invalid };
/// Greedily compute the MappingInfo to use to perform a copy of `sizes`
/// elements of bitwidth `elementalBitwidth`.
/// The `desiredBitAlignment` is the number of bits by which the most minor
/// dimension of the copy is expected to be aligned.
/// This is an approximation of the final alignment, for each row of the copy.
/// It is used to restrict the size of the copied vectors so that they match
/// potential subsequent cp.async operations.
/// If the alignment does not match the alignment required by cp.async down
/// the line, the conversion to cp.async will eventually be skipped, possibly
/// degrading performance.
/// When `favorPredication` is false, the mapping is computed to fill all
/// threads with an equal amount of data to copy, so as to avoid predication.
/// Predication ends up requiring a split epilogue in current pipelining
/// implementations and is better avoided when possible.
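///
/// Example (hypothetical values; `ctx` is a valid MLIRContext pointer):
///   // Map a 128x64 copy of f32 elements onto 256 threads, assuming the
///   // most minor dimension is 128-bit aligned.
///   CopyMappingInfo mapping(ctx, /*totalNumThreads=*/256,
///                           /*desiredBitAlignment=*/128,
///                           /*sizes=*/{128, 64});
///   if (mapping.status == CopyMappingInfo::Status::Invalid)
///     ...  // Bail out: the copy could not be mapped to 256 threads.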
CopyMappingInfo(MLIRContext *ctx, int totalNumThreads,
int64_t desiredBitAlignment, ArrayRef<int64_t> sizes,
bool favorPredication = false,
int64_t elementalBitwidth = 32);
private:
/// Determine the maximal vector size to use to copy a contiguous array of
/// `numContiguousElements`, each of bitwidth `elementalBitwidth`.
/// The `alignment` is the number of elements by which the most minor
/// dimension of the copy is aligned. This is an approximation of the actual
/// memory alignment after bufferization, for each row of the copy. It is
/// used to restrict the size of the copied vector so that it is properly
/// aligned with the requirements of cp.async. If the copy alignment does not
/// match the alignment required by cp.async, the conversion to cp.async will
/// be skipped.
/// Asserts that `elementalBitwidth` divides `numContiguousElements`.
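/// E.g. (illustrative numbers): with `elementalBitwidth` = 32, at most
/// kMaxVectorLoadBitWidth / 32 = 4 elements fit in a single transfer; if each
/// row is only aligned on 2 elements, the result is further restricted to 2.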
static int64_t
maxContiguousElementsToTransfer(int64_t alignment,
int64_t numContiguousElements,
int64_t elementalBitwidth = 32);
/// Compute the number of threads to use to perform a copy of `sizes`
/// elements, each of bitwidth `elementalBitwidth`.
/// The `alignment` is the number of elements by which the most minor
/// dimension of the copy is aligned. This is an approximation of the actual
/// memory alignment after bufferization, for each row of the copy. It is
/// used to restrict the size of the copied vector so that it is properly
/// aligned with the requirements of cp.async. If the copy alignment does not
/// match the alignment required by cp.async, the conversion to cp.async will
/// be skipped.
/// When `favorPredication` is false, the implementation avoids predication
/// in the copy, even if it means reducing the granularity of the transfer.
/// Otherwise, the implementation will come up with a maximal assignment of
/// the remaining threads to sizes of interest, using a DP implementation.
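/// E.g. (illustrative numbers): for totalNumThreads = 128, sizes = {16, 32}
/// and desiredVectorSize = 4, the copy decomposes into 16 x (32 / 4) = 128
/// vector transfers, which map exactly to numThreads = {16, 8} with no
/// predication.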
Status inferNumThreads(int64_t totalNumThreads, ArrayRef<int64_t> sizes,
int64_t desiredVectorSize, bool favorPredication);
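/// Implementation helper for `inferNumThreads` that computes the thread
/// assignment for a single, fixed `desiredVectorSize`.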
Status inferNumThreadsImpl(int64_t totalNumThreads, ArrayRef<int64_t> sizes,
int64_t desiredVectorSize);
public:
// Pretty-printing and diagnostic methods.
void print(llvm::raw_ostream &os) const;
LLVM_DUMP_METHOD void dump() const;
/// Static quantity determining the number of bits to target in an individual
/// copy. Assumes that smaller increments of 64, 32, 16, 8 are also valid
/// transfer sizes. In the future we should have more hardware pluggability
/// here, especially when we want sub-byte granularity.
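/// E.g. a 128-bit transfer corresponds to 4 f32, 8 f16 or 16 i8 elements.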
static constexpr int64_t kMaxVectorLoadBitWidth = 128;
/// Most minor vector size (i.e. 1-D), in number of elements, used in a copy.
int64_t vectorSize;
/// Number of threads to use for the copy mapping, from most major to most
/// minor dims (i.e. numThreads.back() should be mapped to contiguous threads
/// for best coalescing).
using MappingInfo::numThreads;
/// Explicit computation / injection of the smallest bounding tile sizes after
/// mapping to `numThreads`. This is useful in masked scenarios.
SmallVector<int64_t> smallestBoundingTileSizes;
/// Thread mapping attributes, one per entry of `numThreads`.
using MappingInfo::threadMapping;
/// The status of a particular copy mapping. Must be checked before applying
/// transformations.
Status status;
};
} // namespace gpu
} // namespace transform
} // namespace mlir
#endif // MLIR_DIALECT_LINALG_TRANSFORMOPS_GPUHEURISTICS_H