//===- GPUHeuristics.h - GPU heuristics for Linalg transforms ---*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef MLIR_DIALECT_LINALG_TRANSFORMOPS_GPUHEURISTICS_H
#define MLIR_DIALECT_LINALG_TRANSFORMOPS_GPUHEURISTICS_H

#include "mlir/IR/Attributes.h"
#include "mlir/IR/MLIRContext.h"
#include "mlir/Support/LLVM.h"
#include "mlir/Support/LogicalResult.h"
#include "llvm/Support/Compiler.h"

namespace mlir {
namespace transform {
namespace gpu {

/// Base struct to hold GPU mapping information for a given operation.
struct MappingInfo {
/// Number of threads to use for the mapping.
/// Note: When the number of threads used is smaller than the total number of
/// available threads, predication ensues. It is often useful to use more
/// threads and saturate memory bandwidth for some operations, even if others
/// end up being predicated.
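/// E.g. (hypothetical numbers): a copy mapped with numThreads = {32, 4} in a
/// block launched with 256 threads only occupies 128 of them; the remaining
/// 128 threads are predicated off for this operation.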
SmallVector<int64_t> numThreads;
/// Thread mapping attributes, one per entry of `numThreads`.
SmallVector<Attribute> threadMapping;
};

struct CopyMappingInfo : public MappingInfo {
/// Status of the mapping computation. `Invalid` usually means that too many
/// threads are required and we fail to map; this typically happens when the
/// copy is too large compared to the number of available threads.
enum class Status { Success = 0, RequiresPredication, Invalid };
/// Greedily compute the MappingInfo to use to perform a copy of `sizes`
/// elements of bitwidth `elementalBitwidth`.
/// The `desiredBitAlignment` is the number of bits by which the most minor
/// dimension of the copy is expected to be aligned.
/// This is an approximation of the final alignment, for each row of the copy.
/// It is used to restrict the size of the copied vectors so that they match
/// potential subsequent cp.async operations.
/// If the alignment does not match the alignment required by cp.async down
/// the line, the conversion to cp.async will eventually be skipped, possibly
/// degrading performance.
/// When `favorPredication` is false, the mapping is computed to fill all
/// threads with an equal amount of data to copy, so as to avoid predication.
/// Predication ends up requiring a split epilogue in current pipelining
/// implementations and is better avoided when possible.
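///
/// Example (hypothetical values; `ctx` is a valid MLIRContext pointer):
///   // Map a 128x64 copy of f32 elements onto 256 threads, assuming the
///   // most minor dimension is 128-bit aligned.
///   CopyMappingInfo mapping(ctx, /*totalNumThreads=*/256,
///                           /*desiredBitAlignment=*/128,
///                           /*sizes=*/{128, 64});
///   if (mapping.status == CopyMappingInfo::Status::Invalid)
///     ...  // Bail out: the copy could not be mapped to 256 threads.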
CopyMappingInfo(MLIRContext *ctx, int totalNumThreads,
int64_t desiredBitAlignment, ArrayRef<int64_t> sizes,
bool favorPredication = false,
int64_t elementalBitwidth = 32);
private:
/// Determine the maximal vector size to use to copy a contiguous array of
/// `numContiguousElements`, each of bitwidth `elementalBitwidth`.
/// The `alignment` is the number of elements by which the most minor
/// dimension of the copy is aligned. This is an approximation of the actual
/// memory alignment after bufferization, for each row of the copy. It is
/// used to restrict the size of the copied vector so that it is properly
/// aligned with the requirements of cp.async. If the copy alignment does not
/// match the alignment required by cp.async, the conversion to cp.async will
/// be skipped.
/// Asserts that `elementalBitwidth` divides `numContiguousElements`.
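/// E.g. (illustrative numbers): with `elementalBitwidth` = 32, at most
/// kMaxVectorLoadBitWidth / 32 = 4 elements fit in a single transfer; if each
/// row is only aligned on 2 elements, the result is further restricted to 2.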
static int64_t
maxContiguousElementsToTransfer(int64_t alignment,
int64_t numContiguousElements,
int64_t elementalBitwidth = 32);
/// Compute the number of threads to use to perform a copy of `sizes`
/// elements, each of bitwidth `elementalBitwidth`.
/// The `alignment` is the number of elements by which the most minor
/// dimension of the copy is aligned. This is an approximation of the actual
/// memory alignment after bufferization, for each row of the copy. It is
/// used to restrict the size of the copied vector so that it is properly
/// aligned with the requirements of cp.async. If the copy alignment does not
/// match the alignment required by cp.async, the conversion to cp.async will
/// be skipped.
/// When `favorPredication` is false, the implementation avoids predication
/// in the copy, even if it means reducing the granularity of the transfer.
/// Otherwise, the implementation will come up with a maximal assignment of
/// the remaining threads to sizes of interest, using a DP implementation.
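/// E.g. (illustrative numbers): for totalNumThreads = 128, sizes = {16, 32}
/// and desiredVectorSize = 4, the copy decomposes into 16 x (32 / 4) = 128
/// vector transfers, which map exactly to numThreads = {16, 8} with no
/// predication.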
Status inferNumThreads(int64_t totalNumThreads, ArrayRef<int64_t> sizes,
int64_t desiredVectorSize, bool favorPredication);
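/// Implementation helper for `inferNumThreads` that computes the thread
/// assignment for a single, fixed `desiredVectorSize`.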
Status inferNumThreadsImpl(int64_t totalNumThreads, ArrayRef<int64_t> sizes,
int64_t desiredVectorSize);
public:
// Pretty-printing and diagnostic methods.
void print(llvm::raw_ostream &os) const;
LLVM_DUMP_METHOD void dump() const;
/// Static quantity determining the number of bits to target in an individual
/// copy. Assumes that smaller increments of 64, 32, 16, 8 are also valid
/// transfer sizes. In the future we should have more hardware pluggability
/// here, especially when we want sub-byte granularity.
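/// E.g. a 128-bit transfer corresponds to 4 f32, 8 f16 or 16 i8 elements.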
static constexpr int64_t kMaxVectorLoadBitWidth = 128;
/// Most minor vector size (i.e. 1-D), in number of elements, used in a copy.
int64_t vectorSize;
/// Number of threads to use for the copy mapping, from most major to most
/// minor dims (i.e. numThreads.back() should be mapped to contiguous threads
/// for best coalescing).
using MappingInfo::numThreads;
/// Explicit computation / injection of the smallest bounding tile sizes after
/// mapping to `numThreads`. This is useful in masked scenarios.
SmallVector<int64_t> smallestBoundingTileSizes;
/// Thread mapping attributes, one per entry of `numThreads`.
using MappingInfo::threadMapping;
/// The status of a particular copy mapping. Must be checked before applying
/// transformations.
Status status;
};
} // namespace gpu
} // namespace transform
} // namespace mlir
#endif // MLIR_DIALECT_LINALG_TRANSFORMOPS_GPUHEURISTICS_H