mlir/lib/Dialect/Linalg/TransformOps/GPUHeuristics.cpp - llvm-project - Git at Google

 //===- GPUHeuristics.cpp - Heuristics Implementation for Transforms -------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//

 #include "mlir/Dialect/Linalg/TransformOps/GPUHeuristics.h"

 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Support/MathExtras.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include <cmath>
 #include <numeric>

 using namespace mlir;

 #define DEBUG_TYPE "linalg-transforms"
 #define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
 #define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")

 /// Builds the GPU thread mapping attribute for `MappingId::LinearDim0` in
 /// context `ctx`.
 static Attribute linearId0(MLIRContext *ctx) {
   const auto mappingId = gpu::MappingId::LinearDim0;
   return gpu::GPUThreadMappingAttr::get(ctx, mappingId);
 }
 /// Builds the GPU thread mapping attribute for `MappingId::LinearDim1` in
 /// context `ctx`.
 static Attribute linearId1(MLIRContext *ctx) {
   const auto mappingId = gpu::MappingId::LinearDim1;
   return gpu::GPUThreadMappingAttr::get(ctx, mappingId);
 }
 /// Builds the GPU thread mapping attribute for `MappingId::LinearDim2` in
 /// context `ctx`.
 static Attribute linearId2(MLIRContext *ctx) {
   const auto mappingId = gpu::MappingId::LinearDim2;
   return gpu::GPUThreadMappingAttr::get(ctx, mappingId);
 }

 /// Computes the thread mapping for a copy of shape `copySizes`.
 ///
 /// The steps are:
 ///   1. pick a vector size for the most minor dimension with
 ///      `maxContiguousElementsToTransfer`;
 ///   2. call `inferNumThreads` to fill `numThreads` / `vectorSize`; when it
 ///      reports `Status::Invalid` the constructor returns immediately and the
 ///      tile sizes and thread mapping are not computed;
 ///   3. set `smallestBoundingTileSizes[i]` to
 ///      ceilDiv(copySizes[i], numThreads[i]);
 ///   4. set `threadMapping` to the trailing entries of
 ///      {LinearDim2, LinearDim1, LinearDim0}, one per copy dimension.
 transform::gpu::CopyMappingInfo::CopyMappingInfo(MLIRContext *ctx,
                                                  int totalNumThreads,
                                                  int64_t desiredBitAlignment,
                                                  ArrayRef<int64_t> copySizes,
                                                  bool favorPredication,
                                                  int64_t elementalBitwidth) {
   assert(!copySizes.empty() && copySizes.size() <= 3 &&
          "only 1,2,3-D copies are supported for now");

   LDBG("START CopyMappingInfo, favorPredication: " << favorPredication);
   LLVM_DEBUG(llvm::interleaveComma(copySizes, DBGS() << "--copy shape: ");
              llvm::dbgs() << "\n";);

   // Greedy choice of the widest vector that fits the most minor dimension:
   // the goal is to fill kMaxVectorLoadBitWidth contiguous memory transactions
   // with as few threads as possible.
   const int64_t mostMinorSize = copySizes.back();
   int64_t desiredVectorSize = CopyMappingInfo::maxContiguousElementsToTransfer(
       desiredBitAlignment, mostMinorSize, elementalBitwidth);

   LDBG("--greedily determined vectorSize: "
        << desiredVectorSize << " elements of " << elementalBitwidth
        << "b each -> " << (desiredVectorSize * elementalBitwidth)
        << "b total out of a max of " << kMaxVectorLoadBitWidth << "b");

   status = inferNumThreads(totalNumThreads, copySizes, desiredVectorSize,
                            favorPredication);
   // Nothing else can be derived from an invalid mapping.
   if (status == Status::Invalid)
     return;

   LLVM_DEBUG(llvm::interleaveComma(copySizes, DBGS() << "--copy: ");
              llvm::dbgs() << "\n"; llvm::interleaveComma(
                  this->numThreads, DBGS() << "--numThreads: ");
              llvm::dbgs() << "\n";);
   LDBG("--vectorSize: " << this->vectorSize);
   assert(this->numThreads.size() == copySizes.size() &&
          "compute copy mapping expected same number of threads and copy sizes");

   // Smallest bounding box: per dimension, the copy size divided (rounding up)
   // by the number of threads assigned to that dimension.
   SmallVector<int64_t> boundingTileSizes;
   for (auto [size, threads] : llvm::zip(copySizes, this->numThreads))
     boundingTileSizes.push_back(mlir::ceilDiv(size, threads));
   this->smallestBoundingTileSizes = boundingTileSizes;

   // Thread mapping: keep as many trailing linear ids as there are dimensions.
   SmallVector<Attribute> allThreadMappings{linearId2(ctx), linearId1(ctx),
                                            linearId0(ctx)};
   const size_t rank = this->smallestBoundingTileSizes.size();
   ArrayRef<Attribute> trailingMappings =
       ArrayRef<Attribute>(allThreadMappings).take_back(rank);
   this->threadMapping = llvm::to_vector(trailingMappings);
   LLVM_DEBUG(this->print(DBGS()); llvm::dbgs() << "\n");
 }

 /// Returns the greatest common divisor of:
 ///   - `desiredBitAlignment / elementalBitwidth`,
 ///   - `numContiguousElements`,
 ///   - `kMaxVectorLoadBitWidth / elementalBitwidth`.
 /// `elementalBitwidth` is asserted to divide both `kMaxVectorLoadBitWidth`
 /// and `desiredBitAlignment`.
 int64_t transform::gpu::CopyMappingInfo::maxContiguousElementsToTransfer(
     int64_t desiredBitAlignment, int64_t numContiguousElements,
     int64_t elementalBitwidth) {
   assert(kMaxVectorLoadBitWidth % elementalBitwidth == 0 &&
          "elemental bitwidth does not divide kMaxVectorLoadBitWidth");
   assert(desiredBitAlignment % elementalBitwidth == 0 &&
          "elemental bitwidth does not divide desired bit alignment");
   // Element counts implied by the alignment and by the widest vector load.
   const int64_t alignedElements = desiredBitAlignment / elementalBitwidth;
   const int64_t maxLoadElements = kMaxVectorLoadBitWidth / elementalBitwidth;
   const int64_t fitting = std::gcd(alignedElements, numContiguousElements);
   return std::gcd(fitting, maxLoadElements);
 }

 /// Get the list of all factors that divide `val`, not just the prime factors.
 /// Factors are returned in increasing order, each exactly once, and the list
 /// always ends with `val` itself (for `val <= 1` the result is just `{val}`).
 static SmallVector<int64_t> getFactors(int64_t val) {
   // No up-front `reserve(val)`: a number has far fewer divisors than its
   // value, and a negative `val` would wrap to a huge capacity request.
   SmallVector<int64_t> factors;
   // Only proper divisors are collected here, and none can exceed `val / 2`;
   // `val` itself is appended once below so that it is not duplicated.
   for (int64_t factor = 1; factor <= val / 2; ++factor) {
     if (val % factor == 0)
       factors.push_back(factor);
   }
   factors.push_back(val);
   return factors;
 }

 /// Returns the product of all entries of `vals`; the product of an empty
 /// list is 1.
 static int64_t product(ArrayRef<int64_t> vals) {
   return std::accumulate(
       vals.begin(), vals.end(), static_cast<int64_t>(1),
       [](int64_t acc, int64_t val) { return acc * val; });
 }

 /// Extract `result` from `sizes` with the following constraints:
 ///   1. sizes[i] % result[i] == 0 for all i
 ///   2. product(result) <= maxNumThreads
 ///   3. if `currentIndex` is sizes.size() - 1, then result[currentIndex]
 ///      must be sizes[currentIndex].
 /// This is used to greedily extract the maximum number of threads usable for
 /// mapping a copy of size `sizes`, while being bounded by `maxNumThreads` and
 /// ensuring coalesced access along the most minor dimension.
 /// Return the number of threads used in the range:
 ///   result[currentIndex .. sizes.end()]
 /// or an empty vector when no factor of sizes[currentIndex] leads to a
 /// combination that fits in `maxNumThreads`. The mandated most minor entry is
 /// returned without checking it against `maxNumThreads`; the caller one level
 /// up performs that check.
 // The implementation is a recursive exhaustive search over the factors of
 // each size that keeps the best combination under the constraints.
 // TODO: Implementation details can be improved but putting effort there is a
 // tradeoff: `sizes` is expected to be of small rank and contain small values.
 static SmallVector<int64_t> maximizeNumThreads(ArrayRef<int64_t> sizes,
                                                int64_t currentIndex,
                                                int64_t maxNumThreads) {
   assert(static_cast<size_t>(currentIndex) < sizes.size() &&
          "currentIndex out of bounds");
   std::string indent(2 * currentIndex, '-');
   if (static_cast<size_t>(currentIndex) == sizes.size() - 1) {
     LDBG(indent << "mandated globalBest: " << sizes[currentIndex]);
     return SmallVector<int64_t>{sizes[currentIndex]};
   }

   int64_t best = 0;
   int64_t s = sizes[currentIndex];
   // Every factor is used as a divisor below, so the size must be positive.
   assert(s > 0 && "expected strictly positive sizes");
   SmallVector<int64_t> factors = getFactors(s);
   SmallVector<int64_t> localThreadsPerDim;
   localThreadsPerDim.reserve(sizes.size());
   LDBG(indent << "maximizeNumThreads in " << s
               << " with limit: " << maxNumThreads);
   for (auto factor : factors) {
     auto nestedThreadsPerDim =
         maximizeNumThreads(sizes, currentIndex + 1, maxNumThreads / factor);
     // An empty nested result means the remaining dimensions cannot be mapped
     // within the reduced budget. It must be skipped explicitly: the product
     // of an empty list is 1, which would otherwise let `factor` alone be
     // recorded as a solution that is missing the inner dimensions.
     if (nestedThreadsPerDim.empty())
       continue;
     int64_t localBest = factor * product(nestedThreadsPerDim);
     if (localBest > best && localBest <= maxNumThreads) {
       LDBG(indent << "new localBest: " << localBest);
       LLVM_DEBUG(
           llvm::interleaveComma(nestedThreadsPerDim,
                                 DBGS() << indent << "nestedThreadsPerDim: ");
           llvm::dbgs() << "\n";);
       localThreadsPerDim.clear();
       localThreadsPerDim.push_back(factor);
       llvm::append_range(localThreadsPerDim, nestedThreadsPerDim);
       best = localBest;
     }
   }

   LDBG(indent << "found globalBest: " << best);
   LLVM_DEBUG(llvm::interleaveComma(localThreadsPerDim,
                                    DBGS() << indent << "numThreads: ");
              llvm::dbgs() << "\n";);

   return localThreadsPerDim;
 }

 /// Infers `numThreads` and `vectorSize` for a copy of shape `sizes`.
 ///
 /// When `favorPredication` is false, the vector sizes `desiredVectorSize`,
 /// `desiredVectorSize / 2`, ..., 1 are tried in turn, looking for a mapping
 /// that uses exactly `totalNumThreads` threads (`Status::Success`). When no
 /// such mapping is found, or when `favorPredication` is true, the mapping
 /// computed for `desiredVectorSize` is used and its status is returned.
 /// `Status::Invalid` is only returned when `desiredVectorSize` itself cannot
 /// be mapped.
 transform::gpu::CopyMappingInfo::Status
 transform::gpu::CopyMappingInfo::inferNumThreads(int64_t totalNumThreads,
                                                  ArrayRef<int64_t> sizes,
                                                  int64_t desiredVectorSize,
                                                  bool favorPredication) {

   if (!favorPredication) {
     int64_t localVectorSize = desiredVectorSize;
     for (; localVectorSize >= 1; localVectorSize /= 2) {
       // Attempt to map the copy with the current fixed vector size:
       //   1. if the status is Success, we are done.
       //   2. if the status is Invalid at the original vector size, we fail
       //   immediately: reducing the vector size only increases the number of
       //   threads needed, so it cannot offset the bad tile size selection
       //   from the higher-level.
       //   3. if the status is Invalid after a reduction, a larger vector size
       //   was mappable with predication; stop reducing and fall back to it.
       //   4. if the status is RequiresPredication, we try again with a
       //   smaller vector size.
       Status attempt =
           inferNumThreadsImpl(totalNumThreads, sizes, localVectorSize);
       if (attempt == Status::Success)
         return attempt;
       if (attempt == Status::Invalid) {
         if (localVectorSize == desiredVectorSize)
           return attempt;
         break;
       }

       LDBG("requires predication, try reducing vector size to "
            << (localVectorSize / 2));
     }
   }

   // If we have not yet returned, either predication is favored or no vector
   // size gave a mapping without predication. Restart from the original vector
   // size and do not attempt to reduce it; this also overwrites any state left
   // behind by the attempts above.
   return inferNumThreadsImpl(totalNumThreads, sizes, desiredVectorSize);
 }

 /// Tries to map a copy of shape `sizes` onto at most `totalNumThreads`
 /// threads using vectors of `desiredVectorSize` elements along the most minor
 /// dimension.
 ///
 /// Returns:
 ///   - `Status::Invalid` when the scaled most minor dimension alone needs
 ///     more than `totalNumThreads` threads, or when no consistent per
 ///     dimension thread count could be inferred; `vectorSize` and
 ///     `numThreads` are left unchanged in that case;
 ///   - `Status::Success` when exactly `totalNumThreads` threads are used;
 ///   - `Status::RequiresPredication` when fewer threads are used.
 /// In the last two cases `vectorSize` and `numThreads` are updated.
 transform::gpu::CopyMappingInfo::Status
 transform::gpu::CopyMappingInfo::inferNumThreadsImpl(
     int64_t totalNumThreads, ArrayRef<int64_t> sizes,
     int64_t desiredVectorSize) {
   assert(sizes.back() % desiredVectorSize == 0 &&
          "most-minor size not divisible by actualVectorSize");

   LDBG("inferNumThreadsImpl with totalNumThreads: "
        << totalNumThreads << " and vectorSize: " << desiredVectorSize);

   // Scale the most minor size to account for the chosen vector size and
   // maximize the number of threads without exceeding the total number of
   // threads.
   SmallVector<int64_t> scaledSizes{sizes};
   scaledSizes.back() /= desiredVectorSize;
   if (scaledSizes.back() > totalNumThreads) {
     LDBG("--Too few threads given the required vector size -> FAIL");
     return Status::Invalid;
   }
   SmallVector<int64_t> inferredNumThreads =
       maximizeNumThreads(scaledSizes, 0, totalNumThreads);

   LLVM_DEBUG(llvm::interleaveComma(inferredNumThreads,
                                    DBGS() << "inferred numThreads: ");
              llvm::dbgs() << "\n";
              LDBG("computed actualVectorSize: " << desiredVectorSize););

   // The product check below cannot detect a missing dimension (the product of
   // an empty list is 1), so verify explicitly that one thread count was
   // inferred per copy dimension before trusting the result.
   if (inferredNumThreads.size() != sizes.size()) {
     LDBG("--Could not infer a thread count for every dimension -> FAIL");
     return Status::Invalid;
   }

   // Corner case: we cannot use more threads than available. If the dimension of
   // the copy is so bad it is because higher-level tiling did not do its job, we
   // do not try to recover from it here.
   int64_t totalNumThreadsUsed = product(inferredNumThreads);
   LDBG("--totalNumThreadsUsed: " << totalNumThreadsUsed);
   if (totalNumThreadsUsed == 0 || totalNumThreadsUsed > totalNumThreads) {
     LDBG("--Too few threads given the required vector size -> FAIL");
     return Status::Invalid;
   }

   this->vectorSize = desiredVectorSize;
   this->numThreads = inferredNumThreads;
   if (totalNumThreadsUsed == totalNumThreads)
     return Status::Success;

   return Status::RequiresPredication;
 }

 /// Prints a one-line summary of this mapping to `os`: validity, vector size,
 /// and the comma-separated `numThreads`, `smallestBoundingTileSizes` and
 /// `threadMapping` lists, each wrapped in braces.
 void transform::gpu::CopyMappingInfo::print(llvm::raw_ostream &os) const {
   os << "MappingInfo{";
   os << "CopyMappingInfo: ";
   os << "valid: " << (status != Status::Invalid) << ", ";
   os << "vectorSize: " << vectorSize << ", ";
   // The separator was already emitted after vectorSize; do not repeat it.
   llvm::interleaveComma(numThreads, os << "numThreads: {");
   llvm::interleaveComma(smallestBoundingTileSizes,
                         os << "}, smallestBoundingTileSizes: {");
   llvm::interleaveComma(threadMapping, os << "}, threadMapping: {");
   os << "}}";
 }
	//===- GPUHeuristics.cpp - Heuristics Implementation for Transforms -------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//

	#include "mlir/Dialect/Linalg/TransformOps/GPUHeuristics.h"

	#include "mlir/Dialect/GPU/IR/GPUDialect.h"
	#include "mlir/Support/MathExtras.h"
	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/raw_ostream.h"
	#include <cmath>
	#include <numeric>

	using namespace mlir;

	#define DEBUG_TYPE "linalg-transforms"
	#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
	#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")

	/// Builds the GPU thread mapping attribute for `MappingId::LinearDim0` in
	/// context `ctx`.
	static Attribute linearId0(MLIRContext *ctx) {
	  const auto mappingId = gpu::MappingId::LinearDim0;
	  return gpu::GPUThreadMappingAttr::get(ctx, mappingId);
	}
	/// Builds the GPU thread mapping attribute for `MappingId::LinearDim1` in
	/// context `ctx`.
	static Attribute linearId1(MLIRContext *ctx) {
	  const auto mappingId = gpu::MappingId::LinearDim1;
	  return gpu::GPUThreadMappingAttr::get(ctx, mappingId);
	}
	/// Builds the GPU thread mapping attribute for `MappingId::LinearDim2` in
	/// context `ctx`.
	static Attribute linearId2(MLIRContext *ctx) {
	  const auto mappingId = gpu::MappingId::LinearDim2;
	  return gpu::GPUThreadMappingAttr::get(ctx, mappingId);
	}

	/// Computes the thread mapping for a copy of shape `copySizes`.
	///
	/// The steps are:
	///   1. pick a vector size for the most minor dimension with
	///      `maxContiguousElementsToTransfer`;
	///   2. call `inferNumThreads` to fill `numThreads` / `vectorSize`; when it
	///      reports `Status::Invalid` the constructor returns immediately and the
	///      tile sizes and thread mapping are not computed;
	///   3. set `smallestBoundingTileSizes[i]` to
	///      ceilDiv(copySizes[i], numThreads[i]);
	///   4. set `threadMapping` to the trailing entries of
	///      {LinearDim2, LinearDim1, LinearDim0}, one per copy dimension.
	transform::gpu::CopyMappingInfo::CopyMappingInfo(MLIRContext *ctx,
	                                                 int totalNumThreads,
	                                                 int64_t desiredBitAlignment,
	                                                 ArrayRef<int64_t> copySizes,
	                                                 bool favorPredication,
	                                                 int64_t elementalBitwidth) {
	  assert(!copySizes.empty() && copySizes.size() <= 3 &&
	         "only 1,2,3-D copies are supported for now");

	  LDBG("START CopyMappingInfo, favorPredication: " << favorPredication);
	  LLVM_DEBUG(llvm::interleaveComma(copySizes, DBGS() << "--copy shape: ");
	             llvm::dbgs() << "\n";);

	  // Greedy choice of the widest vector that fits the most minor dimension:
	  // the goal is to fill kMaxVectorLoadBitWidth contiguous memory transactions
	  // with as few threads as possible.
	  const int64_t mostMinorSize = copySizes.back();
	  int64_t desiredVectorSize = CopyMappingInfo::maxContiguousElementsToTransfer(
	      desiredBitAlignment, mostMinorSize, elementalBitwidth);

	  LDBG("--greedily determined vectorSize: "
	       << desiredVectorSize << " elements of " << elementalBitwidth
	       << "b each -> " << (desiredVectorSize * elementalBitwidth)
	       << "b total out of a max of " << kMaxVectorLoadBitWidth << "b");

	  status = inferNumThreads(totalNumThreads, copySizes, desiredVectorSize,
	                           favorPredication);
	  // Nothing else can be derived from an invalid mapping.
	  if (status == Status::Invalid)
	    return;

	  LLVM_DEBUG(llvm::interleaveComma(copySizes, DBGS() << "--copy: ");
	             llvm::dbgs() << "\n"; llvm::interleaveComma(
	                 this->numThreads, DBGS() << "--numThreads: ");
	             llvm::dbgs() << "\n";);
	  LDBG("--vectorSize: " << this->vectorSize);
	  assert(this->numThreads.size() == copySizes.size() &&
	         "compute copy mapping expected same number of threads and copy sizes");

	  // Smallest bounding box: per dimension, the copy size divided (rounding up)
	  // by the number of threads assigned to that dimension.
	  SmallVector<int64_t> boundingTileSizes;
	  for (auto [size, threads] : llvm::zip(copySizes, this->numThreads))
	    boundingTileSizes.push_back(mlir::ceilDiv(size, threads));
	  this->smallestBoundingTileSizes = boundingTileSizes;

	  // Thread mapping: keep as many trailing linear ids as there are dimensions.
	  SmallVector<Attribute> allThreadMappings{linearId2(ctx), linearId1(ctx),
	                                           linearId0(ctx)};
	  const size_t rank = this->smallestBoundingTileSizes.size();
	  ArrayRef<Attribute> trailingMappings =
	      ArrayRef<Attribute>(allThreadMappings).take_back(rank);
	  this->threadMapping = llvm::to_vector(trailingMappings);
	  LLVM_DEBUG(this->print(DBGS()); llvm::dbgs() << "\n");
	}

	/// Returns the greatest common divisor of:
	///   - `desiredBitAlignment / elementalBitwidth`,
	///   - `numContiguousElements`,
	///   - `kMaxVectorLoadBitWidth / elementalBitwidth`.
	/// `elementalBitwidth` is asserted to divide both `kMaxVectorLoadBitWidth`
	/// and `desiredBitAlignment`.
	int64_t transform::gpu::CopyMappingInfo::maxContiguousElementsToTransfer(
	    int64_t desiredBitAlignment, int64_t numContiguousElements,
	    int64_t elementalBitwidth) {
	  assert(kMaxVectorLoadBitWidth % elementalBitwidth == 0 &&
	         "elemental bitwidth does not divide kMaxVectorLoadBitWidth");
	  assert(desiredBitAlignment % elementalBitwidth == 0 &&
	         "elemental bitwidth does not divide desired bit alignment");
	  // Element counts implied by the alignment and by the widest vector load.
	  const int64_t alignedElements = desiredBitAlignment / elementalBitwidth;
	  const int64_t maxLoadElements = kMaxVectorLoadBitWidth / elementalBitwidth;
	  const int64_t fitting = std::gcd(alignedElements, numContiguousElements);
	  return std::gcd(fitting, maxLoadElements);
	}

	/// Get the list of all factors that divide `val`, not just the prime factors.
	/// Factors are returned in increasing order, each exactly once, and the list
	/// always ends with `val` itself (for `val <= 1` the result is just `{val}`).
	static SmallVector<int64_t> getFactors(int64_t val) {
	  // No up-front `reserve(val)`: a number has far fewer divisors than its
	  // value, and a negative `val` would wrap to a huge capacity request.
	  SmallVector<int64_t> factors;
	  // Only proper divisors are collected here, and none can exceed `val / 2`;
	  // `val` itself is appended once below so that it is not duplicated.
	  for (int64_t factor = 1; factor <= val / 2; ++factor) {
	    if (val % factor == 0)
	      factors.push_back(factor);
	  }
	  factors.push_back(val);
	  return factors;
	}

	/// Returns the product of all entries of `vals`; the product of an empty
	/// list is 1.
	static int64_t product(ArrayRef<int64_t> vals) {
	  return std::accumulate(
	      vals.begin(), vals.end(), static_cast<int64_t>(1),
	      [](int64_t acc, int64_t val) { return acc * val; });
	}

	/// Extract `result` from `sizes` with the following constraints:
	///   1. sizes[i] % result[i] == 0 for all i
	///   2. product(result) <= maxNumThreads
	///   3. if `currentIndex` is sizes.size() - 1, then result[currentIndex]
	///      must be sizes[currentIndex].
	/// This is used to greedily extract the maximum number of threads usable for
	/// mapping a copy of size `sizes`, while being bounded by `maxNumThreads` and
	/// ensuring coalesced access along the most minor dimension.
	/// Return the number of threads used in the range:
	///   result[currentIndex .. sizes.end()]
	/// or an empty vector when no factor of sizes[currentIndex] leads to a
	/// combination that fits in `maxNumThreads`. The mandated most minor entry is
	/// returned without checking it against `maxNumThreads`; the caller one level
	/// up performs that check.
	// The implementation is a recursive exhaustive search over the factors of
	// each size that keeps the best combination under the constraints.
	// TODO: Implementation details can be improved but putting effort there is a
	// tradeoff: `sizes` is expected to be of small rank and contain small values.
	static SmallVector<int64_t> maximizeNumThreads(ArrayRef<int64_t> sizes,
	                                               int64_t currentIndex,
	                                               int64_t maxNumThreads) {
	  assert(static_cast<size_t>(currentIndex) < sizes.size() &&
	         "currentIndex out of bounds");
	  std::string indent(2 * currentIndex, '-');
	  if (static_cast<size_t>(currentIndex) == sizes.size() - 1) {
	    LDBG(indent << "mandated globalBest: " << sizes[currentIndex]);
	    return SmallVector<int64_t>{sizes[currentIndex]};
	  }

	  int64_t best = 0;
	  int64_t s = sizes[currentIndex];
	  // Every factor is used as a divisor below, so the size must be positive.
	  assert(s > 0 && "expected strictly positive sizes");
	  SmallVector<int64_t> factors = getFactors(s);
	  SmallVector<int64_t> localThreadsPerDim;
	  localThreadsPerDim.reserve(sizes.size());
	  LDBG(indent << "maximizeNumThreads in " << s
	              << " with limit: " << maxNumThreads);
	  for (auto factor : factors) {
	    auto nestedThreadsPerDim =
	        maximizeNumThreads(sizes, currentIndex + 1, maxNumThreads / factor);
	    // An empty nested result means the remaining dimensions cannot be mapped
	    // within the reduced budget. It must be skipped explicitly: the product
	    // of an empty list is 1, which would otherwise let `factor` alone be
	    // recorded as a solution that is missing the inner dimensions.
	    if (nestedThreadsPerDim.empty())
	      continue;
	    int64_t localBest = factor * product(nestedThreadsPerDim);
	    if (localBest > best && localBest <= maxNumThreads) {
	      LDBG(indent << "new localBest: " << localBest);
	      LLVM_DEBUG(
	          llvm::interleaveComma(nestedThreadsPerDim,
	                                DBGS() << indent << "nestedThreadsPerDim: ");
	          llvm::dbgs() << "\n";);
	      localThreadsPerDim.clear();
	      localThreadsPerDim.push_back(factor);
	      llvm::append_range(localThreadsPerDim, nestedThreadsPerDim);
	      best = localBest;
	    }
	  }

	  LDBG(indent << "found globalBest: " << best);
	  LLVM_DEBUG(llvm::interleaveComma(localThreadsPerDim,
	                                   DBGS() << indent << "numThreads: ");
	             llvm::dbgs() << "\n";);

	  return localThreadsPerDim;
	}

	/// Infers `numThreads` and `vectorSize` for a copy of shape `sizes`.
	///
	/// When `favorPredication` is false, the vector sizes `desiredVectorSize`,
	/// `desiredVectorSize / 2`, ..., 1 are tried in turn, looking for a mapping
	/// that uses exactly `totalNumThreads` threads (`Status::Success`). When no
	/// such mapping is found, or when `favorPredication` is true, the mapping
	/// computed for `desiredVectorSize` is used and its status is returned.
	/// `Status::Invalid` is only returned when `desiredVectorSize` itself cannot
	/// be mapped.
	transform::gpu::CopyMappingInfo::Status
	transform::gpu::CopyMappingInfo::inferNumThreads(int64_t totalNumThreads,
	                                                 ArrayRef<int64_t> sizes,
	                                                 int64_t desiredVectorSize,
	                                                 bool favorPredication) {

	  if (!favorPredication) {
	    int64_t localVectorSize = desiredVectorSize;
	    for (; localVectorSize >= 1; localVectorSize /= 2) {
	      // Attempt to map the copy with the current fixed vector size:
	      //   1. if the status is Success, we are done.
	      //   2. if the status is Invalid at the original vector size, we fail
	      //   immediately: reducing the vector size only increases the number of
	      //   threads needed, so it cannot offset the bad tile size selection
	      //   from the higher-level.
	      //   3. if the status is Invalid after a reduction, a larger vector size
	      //   was mappable with predication; stop reducing and fall back to it.
	      //   4. if the status is RequiresPredication, we try again with a
	      //   smaller vector size.
	      Status attempt =
	          inferNumThreadsImpl(totalNumThreads, sizes, localVectorSize);
	      if (attempt == Status::Success)
	        return attempt;
	      if (attempt == Status::Invalid) {
	        if (localVectorSize == desiredVectorSize)
	          return attempt;
	        break;
	      }

	      LDBG("requires predication, try reducing vector size to "
	           << (localVectorSize / 2));
	    }
	  }

	  // If we have not yet returned, either predication is favored or no vector
	  // size gave a mapping without predication. Restart from the original vector
	  // size and do not attempt to reduce it; this also overwrites any state left
	  // behind by the attempts above.
	  return inferNumThreadsImpl(totalNumThreads, sizes, desiredVectorSize);
	}

	/// Tries to map a copy of shape `sizes` onto at most `totalNumThreads`
	/// threads using vectors of `desiredVectorSize` elements along the most minor
	/// dimension.
	///
	/// Returns:
	///   - `Status::Invalid` when the scaled most minor dimension alone needs
	///     more than `totalNumThreads` threads, or when no consistent per
	///     dimension thread count could be inferred; `vectorSize` and
	///     `numThreads` are left unchanged in that case;
	///   - `Status::Success` when exactly `totalNumThreads` threads are used;
	///   - `Status::RequiresPredication` when fewer threads are used.
	/// In the last two cases `vectorSize` and `numThreads` are updated.
	transform::gpu::CopyMappingInfo::Status
	transform::gpu::CopyMappingInfo::inferNumThreadsImpl(
	    int64_t totalNumThreads, ArrayRef<int64_t> sizes,
	    int64_t desiredVectorSize) {
	  assert(sizes.back() % desiredVectorSize == 0 &&
	         "most-minor size not divisible by actualVectorSize");

	  LDBG("inferNumThreadsImpl with totalNumThreads: "
	       << totalNumThreads << " and vectorSize: " << desiredVectorSize);

	  // Scale the most minor size to account for the chosen vector size and
	  // maximize the number of threads without exceeding the total number of
	  // threads.
	  SmallVector<int64_t> scaledSizes{sizes};
	  scaledSizes.back() /= desiredVectorSize;
	  if (scaledSizes.back() > totalNumThreads) {
	    LDBG("--Too few threads given the required vector size -> FAIL");
	    return Status::Invalid;
	  }
	  SmallVector<int64_t> inferredNumThreads =
	      maximizeNumThreads(scaledSizes, 0, totalNumThreads);

	  LLVM_DEBUG(llvm::interleaveComma(inferredNumThreads,
	                                   DBGS() << "inferred numThreads: ");
	             llvm::dbgs() << "\n";
	             LDBG("computed actualVectorSize: " << desiredVectorSize););

	  // The product check below cannot detect a missing dimension (the product of
	  // an empty list is 1), so verify explicitly that one thread count was
	  // inferred per copy dimension before trusting the result.
	  if (inferredNumThreads.size() != sizes.size()) {
	    LDBG("--Could not infer a thread count for every dimension -> FAIL");
	    return Status::Invalid;
	  }

	  // Corner case: we cannot use more threads than available. If the dimension of
	  // the copy is so bad it is because higher-level tiling did not do its job, we
	  // do not try to recover from it here.
	  int64_t totalNumThreadsUsed = product(inferredNumThreads);
	  LDBG("--totalNumThreadsUsed: " << totalNumThreadsUsed);
	  if (totalNumThreadsUsed == 0 || totalNumThreadsUsed > totalNumThreads) {
	    LDBG("--Too few threads given the required vector size -> FAIL");
	    return Status::Invalid;
	  }

	  this->vectorSize = desiredVectorSize;
	  this->numThreads = inferredNumThreads;
	  if (totalNumThreadsUsed == totalNumThreads)
	    return Status::Success;

	  return Status::RequiresPredication;
	}

	/// Prints a one-line summary of this mapping to `os`: validity, vector size,
	/// and the comma-separated `numThreads`, `smallestBoundingTileSizes` and
	/// `threadMapping` lists, each wrapped in braces.
	void transform::gpu::CopyMappingInfo::print(llvm::raw_ostream &os) const {
	  os << "MappingInfo{";
	  os << "CopyMappingInfo: ";
	  os << "valid: " << (status != Status::Invalid) << ", ";
	  os << "vectorSize: " << vectorSize << ", ";
	  // The separator was already emitted after vectorSize; do not repeat it.
	  llvm::interleaveComma(numThreads, os << "numThreads: {");
	  llvm::interleaveComma(smallestBoundingTileSizes,
	                        os << "}, smallestBoundingTileSizes: {");
	  llvm::interleaveComma(threadMapping, os << "}, threadMapping: {");
	  os << "}}";
	}