| //===- MatmulOptimizer.h -------------------------------------------------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #ifndef POLLY_MATMULOPTIMIZER_H |
| #define POLLY_MATMULOPTIMIZER_H |
| |
| #include "isl/isl-noexceptions.h" |
| |
| namespace llvm { |
| class TargetTransformInfo; |
| } |
| |
| namespace polly { |
| struct Dependences; |
| |
| /// Apply the BLIS matmul optimization pattern if possible. |
| /// |
| /// Make the loops containing the matrix multiplication be the innermost |
| /// loops and apply the BLIS matmul optimization pattern. BLIS implements |
| /// gemm as three nested loops around a macro-kernel, plus two packing |
| /// routines. The macro-kernel is implemented in terms of two additional |
| /// loops around a micro-kernel. The micro-kernel is a loop around a rank-1 |
| /// (i.e., outer product) update. |
| /// |
| /// For a detailed description please see [1]. |
| /// |
| /// The order of the loops defines the data reused in the BLIS implementation |
| /// of gemm ([1]). In particular, elements of the matrix B, the second |
| /// operand of matrix multiplication, are reused between iterations of the |
| /// innermost loop. To keep the reused data in cache, only elements of matrix |
| /// A, the first operand of matrix multiplication, should be evicted during |
| /// an iteration of the innermost loop. To provide such a cache replacement |
| /// policy, elements of the matrix A can, in particular, be loaded first and, |
| /// consequently, be least-recently-used. |
| /// |
| /// In our case matrices are stored in row-major order instead of |
| /// column-major order used in the BLIS implementation ([1]). It affects only |
| /// on the form of the BLIS micro kernel and the computation of its |
| /// parameters. In particular, reused elements of the matrix B are |
| /// successively multiplied by specific elements of the matrix A. |
| /// |
| /// Refs.: |
| /// [1] - Analytical Modeling is Enough for High Performance BLIS |
| /// Tze Meng Low, Francisco D Igual, Tyler M Smith, Enrique S Quintana-Orti |
| /// Technical Report, 2014 |
| /// http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf |
| /// |
| /// @see ScheduleTreeOptimizer::createMicroKernel |
| /// @see ScheduleTreeOptimizer::createMacroKernel |
| /// @see getMicroKernelParams |
| /// @see getMacroKernelParams |
| /// |
| /// TODO: Implement the packing transformation. |
| /// |
| /// @param Node The node that contains a band to be optimized. The node |
| /// is required to successfully pass |
| /// ScheduleTreeOptimizer::isMatrMultPattern. |
| /// @param TTI Target Transform Info. |
| /// @param D The dependencies. |
| /// |
| /// @returns The transformed schedule or nullptr if the optimization |
| /// cannot be applied. |
| isl::schedule_node |
| tryOptimizeMatMulPattern(isl::schedule_node Node, |
| const llvm::TargetTransformInfo *TTI, |
| const Dependences *D); |
| |
| } // namespace polly |
| #endif // POLLY_MATMULOPTIMIZER_H |