| //===- GPUHeuristics.cpp - Heuristics Implementation for Transforms -------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "mlir/Dialect/Linalg/TransformOps/GPUHeuristics.h" |
| |
| #include "mlir/Dialect/GPU/IR/GPUDialect.h" |
| #include "mlir/Support/MathExtras.h" |
| #include "llvm/ADT/ArrayRef.h" |
| #include "llvm/ADT/STLExtras.h" |
| #include "llvm/Support/CommandLine.h" |
| #include "llvm/Support/Debug.h" |
| #include "llvm/Support/raw_ostream.h" |
| #include <cmath> |
| #include <numeric> |
| |
| using namespace mlir; |
| |
| #define DEBUG_TYPE "linalg-transforms" |
| #define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") |
| #define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n") |
| |
| static Attribute linearId0(MLIRContext *ctx) { |
| return gpu::GPUThreadMappingAttr::get(ctx, gpu::MappingId::LinearDim0); |
| } |
| static Attribute linearId1(MLIRContext *ctx) { |
| return gpu::GPUThreadMappingAttr::get(ctx, gpu::MappingId::LinearDim1); |
| } |
| static Attribute linearId2(MLIRContext *ctx) { |
| return gpu::GPUThreadMappingAttr::get(ctx, gpu::MappingId::LinearDim2); |
| } |
| |
| transform::gpu::CopyMappingInfo::CopyMappingInfo(MLIRContext *ctx, |
| int totalNumThreads, |
| int64_t desiredBitAlignment, |
| ArrayRef<int64_t> copySizes, |
| bool favorPredication, |
| int64_t elementalBitwidth) { |
| assert(!copySizes.empty() && copySizes.size() <= 3 && |
| "only 1,2,3-D copies are supported for now"); |
| |
| LDBG("START CopyMappingInfo, favorPredication: " << favorPredication); |
| LLVM_DEBUG(llvm::interleaveComma(copySizes, DBGS() << "--copy shape: "); |
| llvm::dbgs() << "\n";); |
| |
| // Greedily find the largest vector size that can be used to copy the most |
| // minor dimension: we are in the business of filling kMaxVectorLoadBitWidth |
| // contiguous memory transactions with as few threads as possible. |
| int64_t desiredVectorSize = CopyMappingInfo::maxContiguousElementsToTransfer( |
| desiredBitAlignment, copySizes.back(), elementalBitwidth); |
| |
| LDBG("--greedily determined vectorSize: " |
| << desiredVectorSize << " elements of " << elementalBitwidth |
| << "b each -> " << (desiredVectorSize * elementalBitwidth) |
| << "b total out of a max of " << kMaxVectorLoadBitWidth << "b"); |
| |
| status = inferNumThreads(totalNumThreads, copySizes, desiredVectorSize, |
| favorPredication); |
| if (status == Status::Invalid) |
| return; |
| |
| LLVM_DEBUG(llvm::interleaveComma(copySizes, DBGS() << "--copy: "); |
| llvm::dbgs() << "\n"; llvm::interleaveComma( |
| this->numThreads, DBGS() << "--numThreads: "); |
| llvm::dbgs() << "\n";); |
| LDBG("--vectorSize: " << this->vectorSize); |
| assert(this->numThreads.size() == copySizes.size() && |
| "compute copy mapping expected same number of threads and copy sizes"); |
| |
| // Compute the smallest bounding box. |
| this->smallestBoundingTileSizes = llvm::to_vector( |
| llvm::map_range(llvm::zip(copySizes, this->numThreads), [](auto &&pair) { |
| int64_t size, numThreads; |
| std::tie(size, numThreads) = pair; |
| return mlir::ceilDiv(size, numThreads); |
| })); |
| SmallVector<Attribute> allThreadMappings{linearId2(ctx), linearId1(ctx), |
| linearId0(ctx)}; |
| |
| // Set the thread mapping. |
| this->threadMapping = |
| llvm::to_vector(ArrayRef(allThreadMappings) |
| .take_back(this->smallestBoundingTileSizes.size())); |
| LLVM_DEBUG(this->print(DBGS()); llvm::dbgs() << "\n"); |
| } |
| |
| int64_t transform::gpu::CopyMappingInfo::maxContiguousElementsToTransfer( |
| int64_t desiredBitAlignment, int64_t numContiguousElements, |
| int64_t elementalBitwidth) { |
| assert(kMaxVectorLoadBitWidth % elementalBitwidth == 0 && |
| "elemental bitwidth does not divide kMaxVectorLoadBitWidth"); |
| assert(desiredBitAlignment % elementalBitwidth == 0 && |
| "elemental bitwidth does not divide desired bit alignment"); |
| return std::gcd( |
| std::gcd(desiredBitAlignment / elementalBitwidth, numContiguousElements), |
| kMaxVectorLoadBitWidth / elementalBitwidth); |
| } |
| |
| /// Get the list of all factors that divide `val`, not just the prime factors. |
| static SmallVector<int64_t> getFactors(int64_t val) { |
| SmallVector<int64_t> factors; |
| factors.reserve(val); |
| for (int64_t factor = 1; factor <= val; ++factor) { |
| if (val % factor != 0) |
| continue; |
| factors.push_back(factor); |
| } |
| factors.push_back(val); |
| return factors; |
| } |
| |
| static int64_t product(ArrayRef<int64_t> vals) { |
| int64_t res = 1; |
| for (auto val : vals) |
| res *= val; |
| return res; |
| } |
| |
| /// Extract `result` from `sizes` with the following constraints: |
| /// 1. sizes[i] % result[i] for all i |
| /// 2. product_of_threadsPerDim <= maxNumThreads |
| /// 3. if `currentIndex` is sizes.size() - 1, then threadsPerDim[currentIndex] |
| /// must be sizes[currentIndex]. |
| /// This is used to greedily extract the maximum number of threads usable for |
| /// mapping a copy of size `sizes`, while being bounded by `totalNumThreads` and |
| /// ensuring coalesced access along the most minor dimension. |
| /// Return the number of threads used in the range: |
| /// threadsPerDim[currentIndex .. sizes.end()] |
| // The implementation uses a dynamic programming approach to greedily extract |
| // the best combination under the constraints. |
| // TODO: Implementation details can be improved but putting effort there is a |
| // tradeoffs: `sizes` is expected to be of small rank and contain small values. |
| static SmallVector<int64_t> maximizeNumThreads(ArrayRef<int64_t> sizes, |
| int64_t currentIndex, |
| int64_t maxNumThreads) { |
| assert(static_cast<size_t>(currentIndex) < sizes.size() && |
| "currentIndex out of bounds"); |
| std::string indent(2 * currentIndex, '-'); |
| if (static_cast<size_t>(currentIndex) == sizes.size() - 1) { |
| LDBG(indent << "mandated globalBest: " << sizes[currentIndex]); |
| return SmallVector<int64_t>{sizes[currentIndex]}; |
| } |
| |
| int64_t best = 0; |
| int64_t s = sizes[currentIndex]; |
| SmallVector<int64_t> factors = getFactors(s); |
| SmallVector<int64_t> localThreadsPerDim; |
| localThreadsPerDim.reserve(sizes.size()); |
| LDBG(indent << "maximizeNumThreads in " << s |
| << " with limit: " << maxNumThreads); |
| for (auto factor : factors) { |
| auto nestedThreadsPerDim = |
| maximizeNumThreads(sizes, currentIndex + 1, maxNumThreads / factor); |
| int64_t localBest = factor * product(nestedThreadsPerDim); |
| if (localBest > best && localBest <= maxNumThreads) { |
| LDBG(indent << "new localBest: " << localBest); |
| LLVM_DEBUG( |
| llvm::interleaveComma(nestedThreadsPerDim, |
| DBGS() << indent << "nestedThreadsPerDim: "); |
| llvm::dbgs() << "\n";); |
| localThreadsPerDim.clear(); |
| localThreadsPerDim.push_back(factor); |
| llvm::append_range(localThreadsPerDim, nestedThreadsPerDim); |
| best = localBest; |
| } |
| } |
| |
| LDBG(indent << "found globalBest: " << best); |
| LLVM_DEBUG(llvm::interleaveComma(localThreadsPerDim, |
| DBGS() << indent << "numThreads: "); |
| llvm::dbgs() << "\n";); |
| |
| return localThreadsPerDim; |
| } |
| |
| transform::gpu::CopyMappingInfo::Status |
| transform::gpu::CopyMappingInfo::inferNumThreads(int64_t totalNumThreads, |
| ArrayRef<int64_t> sizes, |
| int64_t desiredVectorSize, |
| bool favorPredication) { |
| |
| if (!favorPredication) { |
| int64_t localVectorSize = desiredVectorSize; |
| for (; localVectorSize >= 1; localVectorSize /= 2) { |
| // Attempt to map the copy with predication and current fixed vector size: |
| // 1. if the status is Success, we are done. |
| // 2. if the status is Invalid, we fail immediately, no amount of |
| // vector size reduction can offset the bad tile size selection from the |
| // higher-level. |
| // 3. if the status is RequiresPredication, we try again with a smaller |
| // vector size. |
| Status status = |
| inferNumThreadsImpl(totalNumThreads, sizes, localVectorSize); |
| if (status == Status::Success || status == Status::Invalid) |
| return status; |
| |
| LDBG("requires predication, try reducing vector size to " |
| << (localVectorSize / 2)); |
| } |
| } |
| |
| // If we have not yet returned, it means that we have tried all vector sizes |
| // and we still require predication. Restart from the original vector size and |
| // do not attempt to |
| return inferNumThreadsImpl(totalNumThreads, sizes, desiredVectorSize); |
| } |
| |
| transform::gpu::CopyMappingInfo::Status |
| transform::gpu::CopyMappingInfo::inferNumThreadsImpl( |
| int64_t totalNumThreads, ArrayRef<int64_t> sizes, |
| int64_t desiredVectorSize) { |
| assert(sizes.back() % desiredVectorSize == 0 && |
| "most-minor size not divisible by actualVectorSize"); |
| |
| LDBG("inferNumThreadsImpl with totalNumThreads: " |
| << totalNumThreads << " and vectorSize: " << desiredVectorSize); |
| |
| // Scale the most minor size to account for the chosen vector size and |
| // maximize the number of threads without exceeding the total number of |
| // threads. |
| SmallVector<int64_t> scaledSizes{sizes}; |
| scaledSizes.back() /= desiredVectorSize; |
| if (scaledSizes.back() > totalNumThreads) { |
| LDBG("--Too few threads given the required vector size -> FAIL"); |
| return Status::Invalid; |
| } |
| SmallVector<int64_t> inferredNumThreads = |
| maximizeNumThreads(scaledSizes, 0, totalNumThreads); |
| |
| LLVM_DEBUG(llvm::interleaveComma(inferredNumThreads, |
| DBGS() << "inferred numThreads: "); |
| llvm::dbgs() << "\n"; |
| LDBG("computed actualVectorSize: " << desiredVectorSize);); |
| |
| // Corner case: we cannot use more threads than available. If the dimension of |
| // the copy is so bad it is because higher-level tiling did not do its job, we |
| // do not try to recover from it here. |
| int64_t totalNumThreadsUsed = product(inferredNumThreads); |
| LDBG("--totalNumThreadsUsed: " << totalNumThreadsUsed); |
| if (totalNumThreadsUsed == 0 || totalNumThreadsUsed > totalNumThreads) { |
| LDBG("--Too few threads given the required vector size -> FAIL"); |
| return Status::Invalid; |
| } |
| |
| this->vectorSize = desiredVectorSize; |
| this->numThreads = inferredNumThreads; |
| if (totalNumThreadsUsed == totalNumThreads) |
| return Status::Success; |
| |
| return Status::RequiresPredication; |
| } |
| |
| void transform::gpu::CopyMappingInfo::print(llvm::raw_ostream &os) const { |
| os << "MappingInfo{"; |
| os << "CopyMappingInfo: "; |
| os << "valid: " << (status != Status::Invalid) << ", "; |
| os << "vectorSize: " << vectorSize << ", "; |
| llvm::interleaveComma(numThreads, os << ", numThreads: {"); |
| llvm::interleaveComma(smallestBoundingTileSizes, |
| os << "}, smallestBoundingTileSizes: {"); |
| llvm::interleaveComma(threadMapping, os << "}, threadMapping: {"); |
| os << "}}"; |
| } |